make it work on gpus

Raghotham Murthy 2025-06-24 12:20:04 -07:00
parent ee96c4891b
commit f99ca37f91
7 changed files with 30 additions and 21 deletions

File: deploy script

@@ -16,6 +16,14 @@ export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
 set -euo pipefail
 set -x
 
+# Install NVIDIA device plugin for GPU support
+echo "Installing NVIDIA device plugin..."
+kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/refs/tags/v0.17.2/deployments/static/nvidia-device-plugin.yml
+
+# Wait for NVIDIA device plugin to be ready
+echo "Waiting for NVIDIA device plugin to be ready..."
+kubectl wait --for=condition=ready pod -l name=nvidia-device-plugin-ds -n kube-system --timeout=300s
+
 envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
 envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
 envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
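
Once the device plugin DaemonSet is ready, GPU nodes should advertise an allocatable nvidia.com/gpu resource. A quick sanity check (a sketch; assumes a GPU node group is already attached to the cluster):

kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'

Nodes without GPUs show <none> in the GPU column.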

File: ChromaDB deployment template

@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 20Gi
@@ -23,6 +24,8 @@ spec:
       labels:
         app: chromadb
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
         - name: chromadb
           image: chromadb/chroma:latest
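
Both the gp2 storage class and the eks.amazonaws.com/nodegroup label are EKS-specific assumptions; on a different cluster the PVC stays Pending and the pod remains unschedulable. To verify both exist before applying (a sketch):

kubectl get storageclass gp2
kubectl get nodes -L eks.amazonaws.com/nodegroup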

File: postgres-k8s.yaml.template

@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 10Gi
@@ -23,6 +24,8 @@ spec:
       labels:
         app.kubernetes.io/name: postgres
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
         - name: postgres
           image: postgres:15
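
On EKS the default gp2 StorageClass typically uses volumeBindingMode: WaitForFirstConsumer, so the claim reports Pending until the pod lands on a node; that is expected. To watch it bind:

kubectl get pvc -w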

File: llama-stack server deployment template

@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 1Gi
@@ -25,9 +26,11 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: server
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
         - name: llama-stack
-          image: llamastack/distribution-remote-vllm:latest
+          image: llamastack/distribution-postgres-demo:latest
           imagePullPolicy: Always # since we have specified latest instead of a version
           env:
             - name: ENABLE_CHROMADB
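
Because the tag is :latest with imagePullPolicy: Always, every pod restart re-pulls the image. To confirm the swapped image actually rolled out (assuming the Deployment is named llama-stack, matching its labels):

kubectl rollout status deploy/llama-stack
kubectl get deploy/llama-stack -o jsonpath='{.spec.template.spec.containers[0].image}'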

File: llama-stack UI deployment template

@@ -17,6 +17,8 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: ui
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
         - name: llama-stack-ui
           image: node:18-alpine

File: vllm-k8s.yaml.template

@@ -6,6 +6,7 @@ spec:
   accessModes:
     - ReadWriteOnce
   volumeMode: Filesystem
+  storageClassName: gp2
   resources:
     requests:
       storage: 50Gi
@@ -25,16 +26,8 @@ spec:
         app.kubernetes.io/name: vllm
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
         - name: vllm
           image: vllm/vllm-openai:latest
@@ -49,6 +42,11 @@ spec:
                 key: token
           ports:
             - containerPort: 8000
+          resources:
+            requests:
+              nvidia.com/gpu: 1
+            limits:
+              nvidia.com/gpu: 1
           volumeMounts:
             - name: llama-storage
               mountPath: /root/.cache/huggingface
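
Setting requests and limits both to nvidia.com/gpu: 1 makes the GPU a hard scheduling constraint: the pod only lands on a node with a free GPU, and the device plugin wires the device into the container. A minimal smoke test once the pod is running (assumes the Deployment is named vllm):

kubectl exec deploy/vllm -- nvidia-smi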

File: vllm-safety-k8s.yaml.template

@@ -26,16 +26,8 @@ spec:
         app.kubernetes.io/name: vllm-safety
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
         - name: vllm-safety
           image: vllm/vllm-openai:latest
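
With the podAntiAffinity rule gone, the two inference pods are free to schedule onto the same GPU node; the per-pod nvidia.com/gpu requests (added above for the vllm deployment) are what stop two pods from sharing a single GPU. To check where they landed:

kubectl get pods -l workload-type=inference -o wide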