first draft

Kai Wu 2025-07-25 10:41:06 -07:00
parent 025163d8e6
commit e614241876
9 changed files with 64 additions and 60 deletions

View file

@@ -11,8 +11,8 @@ set -euo pipefail
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 K8S_DIR="${SCRIPT_DIR}/../k8s"
 
-echo "Setting up AWS EKS-specific storage class..."
-kubectl apply -f gp3-topology-aware.yaml
+# echo "Setting up AWS EKS-specific storage class..."
+# kubectl apply -f gp3-topology-aware.yaml
 
 echo "Running main Kubernetes deployment..."
 cd "${K8S_DIR}"

View file

@@ -9,7 +9,7 @@ parameters:
   type: gp3
   iops: "3000"
   throughput: "125"
-provisioner: ebs.csi.aws.com
+provisioner: ebs.csi.eks.aws.com
 reclaimPolicy: Delete
 volumeBindingMode: WaitForFirstConsumer
 allowVolumeExpansion: true
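The correct provisioner string depends on how EBS support is installed on the cluster (the standard EBS CSI add-on registers ebs.csi.aws.com, while managed variants register their own driver name). A quick check of what the cluster actually exposes, as a sketch outside this commit:

# Sketch: list registered CSI drivers and existing storage classes so the
# provisioner in gp3-topology-aware.yaml can be matched to what is installed.
kubectl get csidrivers
kubectl get storageclass -o custom-columns=NAME:.metadata.name,PROVISIONER:.provisioner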

View file

@@ -13,6 +13,9 @@ export POSTGRES_PASSWORD=llamastack
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 
+# Set USE_EBS to false if you don't have permission to use EKS EBS
+export USE_EBS=${USE_EBS:-false}
+
 # HF_TOKEN should be set by the user; base64 encode it for the secret
 if [ -n "${HF_TOKEN:-}" ]; then
   export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
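One thing worth checking with the encoding above: GNU base64 wraps long output at 76 columns, which can corrupt the templated secret for long tokens. A round-trip sanity check, as a sketch that is not part of the commit (the -w 0 flag is GNU-specific; omit it on macOS/BSD):

# Sketch: confirm the encoded token decodes back to the original value.
HF_TOKEN_BASE64=$(printf '%s' "$HF_TOKEN" | base64 -w 0)
[ "$(printf '%s' "$HF_TOKEN_BASE64" | base64 -d)" = "$HF_TOKEN" ] \
  && echo "HF_TOKEN round-trips cleanly" \
  || echo "WARNING: HF_TOKEN base64 round-trip failed"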
@@ -47,17 +50,37 @@ if [ -n "${HF_TOKEN:-}" ]; then
   envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
 fi
 
-envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
-envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
-envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
-envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
-
-kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
-  --dry-run=client -o yaml > stack-configmap.yaml
-kubectl apply -f stack-configmap.yaml
-
-envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
-envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
-envsubst < ./ui-k8s.yaml.template | kubectl apply -f -
+# Apply templates with appropriate storage configuration based on USE_EBS setting
+if [ "$USE_EBS" = "true" ]; then
+  echo "Using EBS storage for persistent volumes"
+  envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
+
+  kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
+    --dry-run=client -o yaml > stack-configmap.yaml
+  kubectl apply -f stack-configmap.yaml
+
+  envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./ui-k8s.yaml.template | kubectl apply -f -
+else
+  echo "Using emptyDir for storage (data will not persist across pod restarts)"
+  # Process templates to replace EBS storage with emptyDir
+  envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./vllm-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+
+  kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
+    --dry-run=client -o yaml > stack-configmap.yaml
+  kubectl apply -f stack-configmap.yaml
+
+  # Apply the same emptyDir transformation to the remaining templates
+  envsubst < ./stack-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./ingress-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./ui-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+fi
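To make the sed rewrite concrete, here is what it does to a small, hypothetical volumes fragment (the volume and claim names below are illustrative, not taken from the real templates):

# Sketch: demonstrate the persistentVolumeClaim -> emptyDir rewrite.
printf '%s\n' \
  'volumes:' \
  '- name: model-cache' \
  '  persistentVolumeClaim:' \
  '    claimName: model-cache-pvc' \
| sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d'
# Output:
#   volumes:
#   - name: model-cache
#     emptyDir: {}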

View file

@@ -122,9 +122,6 @@ data:
       provider_id: rag-runtime
     server:
       port: 8321
-      auth:
-        provider_config:
-          type: github_token
 kind: ConfigMap
 metadata:
   creationTimestamp: null

View file

@@ -25,6 +25,13 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: server
     spec:
+      initContainers:
+      - name: wait-for-vllm-server
+        image: busybox:1.28
+        command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8000; do echo waiting for vllm-server on port 8000; sleep 2; done;']
+      - name: wait-for-vllm-server-safety
+        image: busybox:1.28
+        command: ['sh', '-c', 'until nc -z vllm-server-safety.default.svc.cluster.local 8001; do echo waiting for vllm-server-safety on port 8001; sleep 2; done;']
       containers:
       - name: llama-stack
         image: llamastack/distribution-starter:latest
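If the stack pod stays stuck in Init, the same probe the init containers run can be reproduced by hand from a throwaway pod; a debugging sketch, assuming the default namespace:

# Sketch: run the init containers' nc -z check from a one-off busybox pod.
kubectl run nc-check --rm -it --image=busybox:1.28 --restart=Never -- \
  sh -c 'nc -z vllm-server.default.svc.cluster.local 8000 \
    && echo vllm-server reachable || echo vllm-server NOT reachable'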

View file

@@ -119,6 +119,3 @@ tool_groups:
   provider_id: rag-runtime
 server:
   port: 8321
-  auth:
-    provider_config:
-      type: github_token

View file

@@ -19,50 +19,30 @@ spec:
     spec:
       containers:
       - name: llama-stack-ui
-        image: node:18-alpine
-        command: ["/bin/sh"]
+        image: python:3.12-slim
         env:
         - name: LLAMA_STACK_BACKEND_URL
           value: "http://llama-stack-service:8321"
-        - name: LLAMA_STACK_UI_PORT
-          value: "8322"
-        - name: GITHUB_CLIENT_ID
-          value: "${GITHUB_CLIENT_ID}"
-        - name: GITHUB_CLIENT_SECRET
-          value: "${GITHUB_CLIENT_SECRET}"
-        - name: NEXTAUTH_URL
-          value: "${LLAMA_STACK_UI_URL}:8322"
+        - name: LLAMA_STACK_ENDPOINT
+          value: "http://llama-stack-service:8321"
+        workingDir: /app
+        command: ["/bin/sh"]
         args:
         - -c
         - |
-          # Install git (not included in alpine by default)
-          apk add --no-cache git
+          # Install pip and git
+          /usr/local/bin/python -m pip install --upgrade pip
+          apt-get update && apt-get install -y git
 
           # Clone the repository
-          echo "Cloning repository..."
           git clone https://github.com/meta-llama/llama-stack.git /app
 
-          # Navigate to the UI directory
-          echo "Navigating to UI directory..."
-          cd /app/llama_stack/ui
-
-          # Check if package.json exists
-          if [ ! -f "package.json" ]; then
-            echo "ERROR: package.json not found in $(pwd)"
-            ls -la
-            exit 1
-          fi
-
-          # Install dependencies with verbose output
-          echo "Installing dependencies..."
-          npm install --verbose
-
-          # Verify next is installed
-          echo "Checking if next is installed..."
-          npx next --version || echo "Next.js not found, checking node_modules..."
-          ls -la node_modules/.bin/ | grep next || echo "No next binary found"
-
-          npm run dev
+          # Navigate to the playground directory
+          cd /app/llama_stack/distribution/ui
+
+          # Install requirements
+          pip install -r requirements.txt
+
+          # Run the Streamlit app
+          streamlit run app.py --server.port=8322 --server.address=0.0.0.0
         ports:
-        - containerPort: 8322
-        workingDir: /app
+        - containerPort: 8501
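A quick way to confirm the Streamlit playground is actually serving; a sketch that assumes the pod carries an app.kubernetes.io/name=llama-stack-ui label (adjust the selector to whatever the deployment really uses) and targets the 8322 port passed to --server.port above:

# Sketch: port-forward to the UI pod and hit the root page.
POD=$(kubectl get pods -l app.kubernetes.io/name=llama-stack-ui -o jsonpath='{.items[0].metadata.name}')
kubectl port-forward "$POD" 8322:8322 &
sleep 2
curl -sf http://localhost:8322/ >/dev/null && echo "UI is up"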

View file

@@ -25,14 +25,16 @@ spec:
         app.kubernetes.io/name: vllm
         workload-type: inference
     spec:
-      nodeSelector:
-        eks.amazonaws.com/nodegroup: gpu
+      # Removed nodeSelector for GPU nodes as they don't appear to exist in the cluster
+      # If you have GPU nodes with a different label, you can uncomment and modify this section
+      # nodeSelector:
+      #   <your-gpu-node-label-key>: <your-gpu-node-label-value>
       containers:
       - name: vllm
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-        - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6 --enable-auto-tool-choice --tool-call-parser llama4_pythonic"
+        - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4"
         env:
         - name: INFERENCE_MODEL
           value: "${INFERENCE_MODEL}"

View file

@@ -25,14 +25,12 @@ spec:
         app.kubernetes.io/name: vllm-safety
         workload-type: inference
     spec:
-      nodeSelector:
-        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm-safety
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args: [
-          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
+          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.6"
         ]
         env:
         - name: SAFETY_MODEL
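With the inference server at 0.7 and the safety server now at 0.6 of GPU memory, the two only fit together if they land on separate GPUs (0.7 + 0.6 > 1.0). A sketch for spot-checking real usage from inside the safety pod, assuming nvidia-smi is exposed in the container by the NVIDIA runtime:

# Sketch: check GPU memory headroom from the safety server's pod.
POD=$(kubectl get pods -l app.kubernetes.io/name=vllm-safety -o jsonpath='{.items[0].metadata.name}')
kubectl exec "$POD" -- nvidia-smi --query-gpu=memory.used,memory.total --format=csv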