diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh
index 7ff7d28eb..b29e18d2a 100755
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@@ -16,6 +16,14 @@ export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
 set -euo pipefail
 set -x
 
+# Install NVIDIA device plugin for GPU support
+echo "Installing NVIDIA device plugin..."
+kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/refs/tags/v0.17.2/deployments/static/nvidia-device-plugin.yml
+
+# Wait for NVIDIA device plugin to be ready
+echo "Waiting for NVIDIA device plugin to be ready..."
+kubectl wait --for=condition=ready pod -l name=nvidia-device-plugin-ds -n kube-system --timeout=300s
+
 envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
 envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
 envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
diff --git a/docs/source/distributions/k8s/chroma-k8s.yaml.template b/docs/source/distributions/k8s/chroma-k8s.yaml.template
index a2a5e3be3..2083a566b 100644
--- a/docs/source/distributions/k8s/chroma-k8s.yaml.template
+++ b/docs/source/distributions/k8s/chroma-k8s.yaml.template
@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 20Gi
@@ -23,6 +24,8 @@ spec:
       labels:
         app: chromadb
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
       - name: chromadb
         image: chromadb/chroma:latest
diff --git a/docs/source/distributions/k8s/postgres-k8s.yaml.template b/docs/source/distributions/k8s/postgres-k8s.yaml.template
index 86a765652..66e197b15 100644
--- a/docs/source/distributions/k8s/postgres-k8s.yaml.template
+++ b/docs/source/distributions/k8s/postgres-k8s.yaml.template
@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 10Gi
@@ -23,6 +24,8 @@ spec:
       labels:
         app.kubernetes.io/name: postgres
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
       - name: postgres
         image: postgres:15
diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
index 1cfc63ef5..44f69f69f 100644
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 1Gi
@@ -25,9 +26,11 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: server
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
       - name: llama-stack
-        image: llamastack/distribution-remote-vllm:latest
+        image: llamastack/distribution-postgres-demo:latest
         imagePullPolicy: Always # since we have specified latest instead of a version
         env:
         - name: ENABLE_CHROMADB
diff --git a/docs/source/distributions/k8s/ui-k8s.yaml.template b/docs/source/distributions/k8s/ui-k8s.yaml.template
index ef1bf0c55..ca429c029 100644
--- a/docs/source/distributions/k8s/ui-k8s.yaml.template
+++ b/docs/source/distributions/k8s/ui-k8s.yaml.template
@@ -17,6 +17,8 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: ui
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
       - name: llama-stack-ui
         image: node:18-alpine
diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template
index 6256cc7e1..3988066b2 100755
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@@ -6,6 +6,7 @@ spec:
   accessModes:
     - ReadWriteOnce
   volumeMode: Filesystem
+  storageClassName: gp2
   resources:
     requests:
       storage: 50Gi
@@ -25,16 +26,8 @@ spec:
         app.kubernetes.io/name: vllm
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm
         image: vllm/vllm-openai:latest
@@ -49,6 +42,11 @@ spec:
               key: token
         ports:
         - containerPort: 8000
+        resources:
+          requests:
+            nvidia.com/gpu: 1
+          limits:
+            nvidia.com/gpu: 1
         volumeMounts:
         - name: llama-storage
           mountPath: /root/.cache/huggingface
diff --git a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
index 8857e83b6..9bce4aa95 100644
--- a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
@@ -26,16 +26,8 @@ spec:
         app.kubernetes.io/name: vllm-safety
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm-safety
         image: vllm/vllm-openai:latest
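
Note on assumptions: the nodeSelector stanzas above schedule pods by the eks.amazonaws.com/nodegroup label, which EKS attaches automatically to nodes in a managed node group, so this patch presumes node groups literally named "cpu" and "gpu". A minimal eksctl sketch under that assumption; the cluster name and instance types below are hypothetical placeholders, not part of the patch:

    # Hypothetical node groups; only the names must match the nodeSelector values.
    eksctl create nodegroup --cluster llama-stack-demo --name cpu \
      --node-type m5.xlarge --nodes 2
    eksctl create nodegroup --cluster llama-stack-demo --name gpu \
      --node-type g5.2xlarge --nodes 2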
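
The PVC templates also now pin storageClassName: gp2 (the default EBS-backed class on EKS), and the vLLM pods will sit in Pending until the device plugin reports nvidia.com/gpu on the GPU nodes. A quick pre-flight sketch, assuming the node groups and storage class above already exist:

    # Both node groups should appear with the expected label values.
    kubectl get nodes -L eks.amazonaws.com/nodegroup

    # The gp2 StorageClass referenced by the PVCs must exist.
    kubectl get storageclass gp2

    # After apply.sh installs the device plugin, GPU nodes should list
    # nvidia.com/gpu under Capacity and Allocatable.
    kubectl describe nodes -l eks.amazonaws.com/nodegroup=gpu | grep 'nvidia.com/gpu'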