From e614241876282942b163245251f41a2b42c66878 Mon Sep 17 00:00:00 2001
From: Kai Wu
Date: Fri, 25 Jul 2025 10:41:06 -0700
Subject: [PATCH] first draft

---
 docs/source/distributions/eks/apply.sh        |  4 +-
 .../distributions/eks/gp3-topology-aware.yaml |  2 +-
 docs/source/distributions/k8s/apply.sh        | 43 ++++++++++++----
 .../distributions/k8s/stack-configmap.yaml    |  3 --
 .../distributions/k8s/stack-k8s.yaml.template |  7 +++
 .../distributions/k8s/stack_run_config.yaml   |  3 --
 .../distributions/k8s/ui-k8s.yaml.template    | 50 ++++++-------------
 .../distributions/k8s/vllm-k8s.yaml.template  |  8 +--
 .../k8s/vllm-safety-k8s.yaml.template         |  4 +-
 9 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/docs/source/distributions/eks/apply.sh b/docs/source/distributions/eks/apply.sh
index 3ad3dd263..d0b21cf94 100755
--- a/docs/source/distributions/eks/apply.sh
+++ b/docs/source/distributions/eks/apply.sh
@@ -11,8 +11,8 @@ set -euo pipefail
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 K8S_DIR="${SCRIPT_DIR}/../k8s"
 
-echo "Setting up AWS EKS-specific storage class..."
-kubectl apply -f gp3-topology-aware.yaml
+# echo "Setting up AWS EKS-specific storage class..."
+# kubectl apply -f gp3-topology-aware.yaml
 
 echo "Running main Kubernetes deployment..."
 cd "${K8S_DIR}"
diff --git a/docs/source/distributions/eks/gp3-topology-aware.yaml b/docs/source/distributions/eks/gp3-topology-aware.yaml
index 1192ba18c..1a22800a2 100644
--- a/docs/source/distributions/eks/gp3-topology-aware.yaml
+++ b/docs/source/distributions/eks/gp3-topology-aware.yaml
@@ -9,7 +9,7 @@ parameters:
   type: gp3
   iops: "3000"
   throughput: "125"
-provisioner: ebs.csi.aws.com
+provisioner: ebs.csi.eks.aws.com
 reclaimPolicy: Delete
 volumeBindingMode: WaitForFirstConsumer
 allowVolumeExpansion: true
diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh
index 3356da53e..11c327821 100755
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@@ -13,6 +13,9 @@ export POSTGRES_PASSWORD=llamastack
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
 
+# Set USE_EBS=false if your cluster cannot provision EBS-backed volumes (e.g. missing EBS CSI driver permissions)
+export USE_EBS=${USE_EBS:-false}
+
 # HF_TOKEN should be set by the user; base64 encode it for the secret
 if [ -n "${HF_TOKEN:-}" ]; then
   export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
@@ -47,17 +50,37 @@ if [ -n "${HF_TOKEN:-}" ]; then
   envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
 fi
 
-envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
-envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
-envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
-envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
+# Apply templates with appropriate storage configuration based on USE_EBS setting
+if [ "$USE_EBS" = "true" ]; then
+  echo "Using EBS storage for persistent volumes"
+  envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
 
-kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
-  --dry-run=client -o yaml > stack-configmap.yaml
+  kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
+    --dry-run=client -o yaml > stack-configmap.yaml
 
-kubectl apply -f stack-configmap.yaml
+  kubectl apply -f stack-configmap.yaml
-envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
-envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./ui-k8s.yaml.template | kubectl apply -f -
+else
+  echo "Using emptyDir for storage (data will not persist across pod restarts)"
+  # Process templates to replace EBS storage with emptyDir
+  envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./vllm-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
 
-envsubst < ./ui-k8s.yaml.template | kubectl apply -f -
+  kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
+    --dry-run=client -o yaml > stack-configmap.yaml
+
+  kubectl apply -f stack-configmap.yaml
+
+  # Apply the same emptyDir transformation to the remaining templates
+  envsubst < ./stack-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./ingress-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./ui-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+fi
diff --git a/docs/source/distributions/k8s/stack-configmap.yaml b/docs/source/distributions/k8s/stack-configmap.yaml
index c505cba49..129471862 100644
--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@@ -122,9 +122,6 @@ data:
           provider_id: rag-runtime
     server:
       port: 8321
-      auth:
-        provider_config:
-          type: github_token
 kind: ConfigMap
 metadata:
   creationTimestamp: null
diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
index 912445f68..5cfd00425 100644
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -25,6 +25,13 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: server
     spec:
+      initContainers:
+      - name: wait-for-vllm-server
+        image: busybox:1.28
+        command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8000; do echo waiting for vllm-server on port 8000; sleep 2; done;']
+      - name: wait-for-vllm-server-safety
+        image: busybox:1.28
+        command: ['sh', '-c', 'until nc -z vllm-server-safety.default.svc.cluster.local 8001; do echo waiting for vllm-server-safety on port 8001; sleep 2; done;']
       containers:
       - name: llama-stack
         image: llamastack/distribution-starter:latest
diff --git a/docs/source/distributions/k8s/stack_run_config.yaml b/docs/source/distributions/k8s/stack_run_config.yaml
index 4da1bd8b4..23993ca5d 100644
--- a/docs/source/distributions/k8s/stack_run_config.yaml
+++ b/docs/source/distributions/k8s/stack_run_config.yaml
@@ -119,6 +119,3 @@ tool_groups:
   provider_id: rag-runtime
 server:
   port: 8321
-  auth:
-    provider_config:
-      type: github_token
diff --git a/docs/source/distributions/k8s/ui-k8s.yaml.template b/docs/source/distributions/k8s/ui-k8s.yaml.template
index a6859cb86..a83544567 100644
--- a/docs/source/distributions/k8s/ui-k8s.yaml.template
+++ b/docs/source/distributions/k8s/ui-k8s.yaml.template
@@ -19,50 +19,30 @@ spec:
     spec:
       containers:
       - name: llama-stack-ui
-        image: node:18-alpine
-        command: ["/bin/sh"]
+        image: python:3.12-slim
         env:
         - name: LLAMA_STACK_BACKEND_URL
           value: "http://llama-stack-service:8321"
-        - name: LLAMA_STACK_UI_PORT
-          value: "8322"
-        - name: GITHUB_CLIENT_ID
-          value: "${GITHUB_CLIENT_ID}"
-        - name: GITHUB_CLIENT_SECRET
-          value: "${GITHUB_CLIENT_SECRET}"
-        - name: NEXTAUTH_URL
-          value: "${LLAMA_STACK_UI_URL}:8322"
+        - name: LLAMA_STACK_ENDPOINT
+          value: "http://llama-stack-service:8321"
+        workingDir: /app
+        command: ["/bin/sh"]
         args:
         - -c
         - |
-          # Install git (not included in alpine by default)
-          apk add --no-cache git
-
+          # Upgrade pip and install git
+          /usr/local/bin/python -m pip install --upgrade pip
+          apt-get update && apt-get install -y git
           # Clone the repository
-          echo "Cloning repository..."
           git clone https://github.com/meta-llama/llama-stack.git /app
 
-          # Navigate to the UI directory
-          echo "Navigating to UI directory..."
-          cd /app/llama_stack/ui
+          # Navigate to the playground directory
+          cd /app/llama_stack/distribution/ui
 
-          # Check if package.json exists
-          if [ ! -f "package.json" ]; then
-            echo "ERROR: package.json not found in $(pwd)"
-            ls -la
-            exit 1
-          fi
+          # Install requirements
+          pip install -r requirements.txt
 
-          # Install dependencies with verbose output
-          echo "Installing dependencies..."
-          npm install --verbose
-
-          # Verify next is installed
-          echo "Checking if next is installed..."
-          npx next --version || echo "Next.js not found, checking node_modules..."
-          ls -la node_modules/.bin/ | grep next || echo "No next binary found"
-
-          npm run dev
+          # Run the Streamlit app (port must match containerPort below)
+          streamlit run app.py --server.port=8322 --server.address=0.0.0.0
         ports:
-        - containerPort: 8322
-        workingDir: /app
+        - containerPort: 8322
diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template
index 22bee4bbc..efbdcfdde 100644
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@@ -25,14 +25,16 @@ spec:
         app.kubernetes.io/name: vllm
         workload-type: inference
     spec:
-      nodeSelector:
-        eks.amazonaws.com/nodegroup: gpu
+      # Removed the GPU nodeSelector because a matching node group may not exist in every cluster.
+      # If your GPU nodes carry a different label, uncomment and adjust this section:
+      # nodeSelector:
+      #   <label-key>: <label-value>
       containers:
       - name: vllm
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-        - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6 --enable-auto-tool-choice --tool-call-parser llama4_pythonic"
+        - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4"
         env:
         - name: INFERENCE_MODEL
           value: "${INFERENCE_MODEL}"
diff --git a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
index 37b2b9a6b..29542b9e5 100644
--- a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
@@ -25,14 +25,12 @@ spec:
         app.kubernetes.io/name: vllm-safety
         workload-type: inference
     spec:
-      nodeSelector:
-        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm-safety
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args: [
-          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
+          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.6"
         ]
         env:
         - name: SAFETY_MODEL
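
Notes: a minimal usage sketch for the USE_EBS switch introduced above, assuming kubectl and envsubst are installed and HF_TOKEN holds a valid Hugging Face token (hf_xxx below is a placeholder):

    # Default path: emptyDir volumes, no EBS CSI permissions needed
    export HF_TOKEN=hf_xxx   # placeholder
    cd docs/source/distributions/k8s
    ./apply.sh

    # Opt in to EBS-backed PersistentVolumes (requires a working EBS CSI driver)
    USE_EBS=true ./apply.sh

    # The llama-stack server pod waits via its initContainers for both vLLM
    # services before starting; watch progress with:
    kubectl get pods -w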