make it work on gpus

Raghotham Murthy 2025-06-24 12:20:04 -07:00
parent ee96c4891b
commit f99ca37f91
7 changed files with 30 additions and 21 deletions

File: deploy script

@@ -16,6 +16,14 @@ export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
 set -euo pipefail
 set -x
 
+# Install NVIDIA device plugin for GPU support
+echo "Installing NVIDIA device plugin..."
+kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/refs/tags/v0.17.2/deployments/static/nvidia-device-plugin.yml
+
+# Wait for NVIDIA device plugin to be ready
+echo "Waiting for NVIDIA device plugin to be ready..."
+kubectl wait --for=condition=ready pod -l name=nvidia-device-plugin-ds -n kube-system --timeout=300s
+
 envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
 envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
 envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
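
Once the device plugin DaemonSet is ready, GPU nodes should advertise an allocatable nvidia.com/gpu resource. A quick sanity check (a sketch; assumes a GPU node group is already attached to the cluster):

kubectl get nodes -o custom-columns='NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu'

Nodes without GPUs show <none> in the GPU column.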

File: ChromaDB deployment template

@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 20Gi
@@ -23,6 +24,8 @@ spec:
       labels:
         app: chromadb
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
         - name: chromadb
           image: chromadb/chroma:latest
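
Both the gp2 storage class and the eks.amazonaws.com/nodegroup label are EKS-specific assumptions; on a different cluster the PVC stays Pending and the pod remains unschedulable. To verify both exist before applying (a sketch):

kubectl get storageclass gp2
kubectl get nodes -L eks.amazonaws.com/nodegroup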

File: postgres-k8s.yaml.template

@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 10Gi
@@ -23,6 +24,8 @@ spec:
       labels:
         app.kubernetes.io/name: postgres
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
         - name: postgres
           image: postgres:15
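
On EKS the default gp2 StorageClass typically uses volumeBindingMode: WaitForFirstConsumer, so the claim reports Pending until the pod lands on a node; that is expected. To watch it bind:

kubectl get pvc -w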

File: llama-stack server deployment template

@@ -5,6 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
+  storageClassName: gp2
   resources:
     requests:
       storage: 1Gi
@@ -25,9 +26,11 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: server
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
         - name: llama-stack
-          image: llamastack/distribution-remote-vllm:latest
+          image: llamastack/distribution-postgres-demo:latest
           imagePullPolicy: Always # since we have specified latest instead of a version
           env:
             - name: ENABLE_CHROMADB
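
Because the tag is :latest with imagePullPolicy: Always, every pod restart re-pulls the image. To confirm the swapped image actually rolled out (assuming the Deployment is named llama-stack, matching its labels):

kubectl rollout status deploy/llama-stack
kubectl get deploy/llama-stack -o jsonpath='{.spec.template.spec.containers[0].image}'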

File: llama-stack UI deployment template

@@ -17,6 +17,8 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: ui
     spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: cpu
       containers:
         - name: llama-stack-ui
           image: node:18-alpine

File: vllm-k8s.yaml.template

@@ -6,6 +6,7 @@ spec:
   accessModes:
     - ReadWriteOnce
   volumeMode: Filesystem
+  storageClassName: gp2
   resources:
     requests:
       storage: 50Gi
@@ -25,16 +26,8 @@ spec:
         app.kubernetes.io/name: vllm
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
         - name: vllm
           image: vllm/vllm-openai:latest
@@ -49,6 +42,11 @@ spec:
                 key: token
           ports:
             - containerPort: 8000
+          resources:
+            requests:
+              nvidia.com/gpu: 1
+            limits:
+              nvidia.com/gpu: 1
           volumeMounts:
             - name: llama-storage
               mountPath: /root/.cache/huggingface
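
Setting requests and limits both to nvidia.com/gpu: 1 makes the GPU a hard scheduling constraint: the pod only lands on a node with a free GPU, and the device plugin wires the device into the container. A minimal smoke test once the pod is running (assumes the Deployment is named vllm):

kubectl exec deploy/vllm -- nvidia-smi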

File: vllm-safety-k8s.yaml.template

@@ -26,16 +26,8 @@ spec:
         app.kubernetes.io/name: vllm-safety
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
         - name: vllm-safety
           image: vllm/vllm-openai:latest
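
With the podAntiAffinity rule gone, the two inference pods are free to schedule onto the same GPU node; the per-pod nvidia.com/gpu requests (added above for the vllm deployment) are what stop two pods from sharing a single GPU. To check where they landed:

kubectl get pods -l workload-type=inference -o wide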