mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-07-29 15:23:51 +00:00)

Commit e614241876 (parent 025163d8e6): first draft

9 changed files with 64 additions and 60 deletions
@@ -11,8 +11,8 @@ set -euo pipefail
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 K8S_DIR="${SCRIPT_DIR}/../k8s"

-echo "Setting up AWS EKS-specific storage class..."
-kubectl apply -f gp3-topology-aware.yaml
+# echo "Setting up AWS EKS-specific storage class..."
+# kubectl apply -f gp3-topology-aware.yaml

 echo "Running main Kubernetes deployment..."
 cd "${K8S_DIR}"
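The hunk above disables the EKS-specific storage class step rather than deleting it. Where the cluster does have the required EBS permissions, the step can still be run by hand before the main deployment; a minimal sketch, assuming kubectl is already pointed at the EKS cluster and the script sits next to gp3-topology-aware.yaml:

    # apply the gp3 storage class manually, then confirm it was created
    kubectl apply -f gp3-topology-aware.yaml
    kubectl get storageclass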
@@ -9,7 +9,7 @@ parameters:
   type: gp3
   iops: "3000"
   throughput: "125"
-provisioner: ebs.csi.aws.com
+provisioner: ebs.csi.eks.aws.com
 reclaimPolicy: Delete
 volumeBindingMode: WaitForFirstConsumer
 allowVolumeExpansion: true
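This appears to be gp3-topology-aware.yaml, with the provisioner renamed from ebs.csi.aws.com to ebs.csi.eks.aws.com. The right provisioner string depends on which EBS CSI driver is installed in the cluster, so it is worth confirming what is actually registered before relying on either value; a quick check, assuming kubectl access to the cluster:

    # list the CSI drivers and the provisioners used by existing storage classes
    kubectl get csidrivers
    kubectl get storageclass -o custom-columns=NAME:.metadata.name,PROVISIONER:.provisioner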
@@ -13,6 +13,9 @@ export POSTGRES_PASSWORD=llamastack
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

+# Set USE_EBS to false if you don't have permission to use EKS EBS
+export USE_EBS=${USE_EBS:-false}
+
 # HF_TOKEN should be set by the user; base64 encode it for the secret
 if [ -n "${HF_TOKEN:-}" ]; then
   export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
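With USE_EBS defaulting to false, the deploy script now falls back to ephemeral storage unless the caller opts in. A usage sketch, assuming the file being modified here is the k8s apply.sh and that HF_TOKEN is already exported:

    # default: emptyDir-backed storage, no EBS permissions required
    ./apply.sh

    # opt in to EBS-backed persistent volumes
    USE_EBS=true ./apply.sh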
@@ -47,17 +50,37 @@ if [ -n "${HF_TOKEN:-}" ]; then
   envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
 fi

-envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
-envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
-envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
-envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
+# Apply templates with appropriate storage configuration based on USE_EBS setting
+if [ "$USE_EBS" = "true" ]; then
+  echo "Using EBS storage for persistent volumes"
+  envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -

-kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
-  --dry-run=client -o yaml > stack-configmap.yaml
+  kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
+    --dry-run=client -o yaml > stack-configmap.yaml

-kubectl apply -f stack-configmap.yaml
+  kubectl apply -f stack-configmap.yaml

-envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
-envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
-
-envsubst < ./ui-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./ui-k8s.yaml.template | kubectl apply -f -
+else
+  echo "Using emptyDir for storage (data will not persist across pod restarts)"
+  # Process templates to replace EBS storage with emptyDir
+  envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./vllm-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+
+  kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
+    --dry-run=client -o yaml > stack-configmap.yaml
+
+  kubectl apply -f stack-configmap.yaml
+
+  # Apply the same emptyDir transformation to the remaining templates
+  envsubst < ./stack-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./ingress-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./ui-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+fi
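The emptyDir branch rewrites each rendered manifest with sed, turning every persistentVolumeClaim: key into emptyDir: {} and dropping the claimName: lines; this assumes claimName is the only field under each persistentVolumeClaim block. Before applying anything, the rewrite can be previewed against a single template, for example:

    # render one template and inspect the emptyDir rewrite without applying it
    envsubst < ./vllm-k8s.yaml.template \
      | sed 's/persistentVolumeClaim:/emptyDir: {}/g' \
      | sed '/claimName:/d' \
      | less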
@@ -122,9 +122,6 @@ data:
       provider_id: rag-runtime
     server:
       port: 8321
-      auth:
-        provider_config:
-          type: github_token
 kind: ConfigMap
 metadata:
   creationTimestamp: null
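This looks like the regenerated stack-configmap.yaml, mirroring the removal of the GitHub-token auth block from the run config further down. Once the script re-applies it, the change can be confirmed in the cluster; a small check, assuming the ConfigMap keeps the name llama-stack-config used by the script:

    # confirm the applied ConfigMap no longer carries an auth block under server
    kubectl get configmap llama-stack-config -o yaml | grep -A 3 'server:'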
@@ -25,6 +25,13 @@ spec:
         app.kubernetes.io/name: llama-stack
         app.kubernetes.io/component: server
     spec:
+      initContainers:
+      - name: wait-for-vllm-server
+        image: busybox:1.28
+        command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8000; do echo waiting for vllm-server on port 8000; sleep 2; done;']
+      - name: wait-for-vllm-server-safety
+        image: busybox:1.28
+        command: ['sh', '-c', 'until nc -z vllm-server-safety.default.svc.cluster.local 8001; do echo waiting for vllm-server-safety on port 8001; sleep 2; done;']
       containers:
       - name: llama-stack
         image: llamastack/distribution-starter:latest
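The init containers hold the llama-stack pod until both vLLM services accept TCP connections, using busybox's nc in a retry loop. The same probe can be run ad hoc to debug a pod stuck in Init; a sketch, assuming the default namespace used in the service DNS names above:

    # one-off connectivity probes from inside the cluster
    kubectl run nc-probe --rm -it --restart=Never --image=busybox:1.28 -- \
      sh -c 'nc -z vllm-server.default.svc.cluster.local 8000 && echo inference reachable'
    kubectl run nc-probe --rm -it --restart=Never --image=busybox:1.28 -- \
      sh -c 'nc -z vllm-server-safety.default.svc.cluster.local 8001 && echo safety reachable'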
@@ -119,6 +119,3 @@ tool_groups:
   provider_id: rag-runtime
 server:
   port: 8321
-  auth:
-    provider_config:
-      type: github_token
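Dropping the auth block means the server no longer expects a GitHub token on incoming requests. A quick way to verify, assuming a Service named llama-stack-service (as referenced in the UI config) and that the server exposes its usual /v1/health route:

    # port-forward the API locally and call it without an Authorization header
    kubectl port-forward service/llama-stack-service 8321:8321 &
    curl -s http://localhost:8321/v1/health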
@@ -19,50 +19,30 @@ spec:
     spec:
       containers:
       - name: llama-stack-ui
-        image: node:18-alpine
-        command: ["/bin/sh"]
+        image: python:3.12-slim
         env:
         - name: LLAMA_STACK_BACKEND_URL
           value: "http://llama-stack-service:8321"
-        - name: LLAMA_STACK_UI_PORT
-          value: "8322"
-        - name: GITHUB_CLIENT_ID
-          value: "${GITHUB_CLIENT_ID}"
-        - name: GITHUB_CLIENT_SECRET
-          value: "${GITHUB_CLIENT_SECRET}"
-        - name: NEXTAUTH_URL
-          value: "${LLAMA_STACK_UI_URL}:8322"
+        - name: LLAMA_STACK_ENDPOINT
+          value: "http://llama-stack-service:8321"
+        workingDir: /app
+        command: ["/bin/sh"]
         args:
         - -c
         - |
-          # Install git (not included in alpine by default)
-          apk add --no-cache git
+          # Install pip and git
+          /usr/local/bin/python -m pip install --upgrade pip
+          apt-get update && apt-get install -y git
           # Clone the repository
-          echo "Cloning repository..."
           git clone https://github.com/meta-llama/llama-stack.git /app

-          # Navigate to the UI directory
-          echo "Navigating to UI directory..."
-          cd /app/llama_stack/ui
+          # Navigate to the playground directory
+          cd /app/llama_stack/distribution/ui

-          # Check if package.json exists
-          if [ ! -f "package.json" ]; then
-            echo "ERROR: package.json not found in $(pwd)"
-            ls -la
-            exit 1
-          fi
+          # Install requirements
+          pip install -r requirements.txt

-          # Install dependencies with verbose output
-          echo "Installing dependencies..."
-          npm install --verbose
-
-          # Verify next is installed
-          echo "Checking if next is installed..."
-          npx next --version || echo "Next.js not found, checking node_modules..."
-          ls -la node_modules/.bin/ | grep next || echo "No next binary found"
-
-          npm run dev
+          # Run the Streamlit app
+          streamlit run app.py --server.port=8322 --server.address=0.0.0.0
         ports:
-        - containerPort: 8322
-        workingDir: /app
-
+        - containerPort: 8501
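The UI container now clones the repo and serves the Streamlit playground from llama_stack/distribution/ui instead of running the Next.js app. Note that streamlit is started with --server.port=8322 while containerPort advertises 8501 (Streamlit's default); containerPort is informational, so the app will actually listen on 8322 and any Service targeting 8501 would need adjusting. A sketch for reaching the UI locally, assuming the Deployment is named llama-stack-ui to match the container name:

    # forward the UI port locally and open the playground in a browser
    kubectl port-forward deploy/llama-stack-ui 8322:8322
    # then visit http://localhost:8322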
@@ -25,14 +25,16 @@ spec:
         app.kubernetes.io/name: vllm
         workload-type: inference
     spec:
-      nodeSelector:
-        eks.amazonaws.com/nodegroup: gpu
+      # Removed nodeSelector for GPU nodes as they don't appear to exist in the cluster
+      # If you have GPU nodes with a different label, you can uncomment and modify this section
+      # nodeSelector:
+      #   <your-gpu-node-label-key>: <your-gpu-node-label-value>
       containers:
       - name: vllm
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-          - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6 --enable-auto-tool-choice --tool-call-parser llama4_pythonic"
+          - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4"
         env:
         - name: INFERENCE_MODEL
           value: "${INFERENCE_MODEL}"
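Removing the nodeSelector lets the scheduler place the vLLM pod on any node; whether it ends up on a GPU-capable node then depends on the cluster. Before and after deploying, it is worth checking GPU capacity and where the pod landed; a sketch using the app.kubernetes.io/name=vllm label from the template:

    # check whether any node advertises NVIDIA GPUs
    kubectl describe nodes | grep -i 'nvidia.com/gpu' || echo "no GPU capacity reported"

    # see where the inference pod was scheduled
    kubectl get pods -l app.kubernetes.io/name=vllm -o wide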
@@ -25,14 +25,12 @@ spec:
         app.kubernetes.io/name: vllm-safety
         workload-type: inference
     spec:
-      nodeSelector:
-        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm-safety
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args: [
-          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
+          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.6"
         ]
         env:
         - name: SAFETY_MODEL
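Raising --gpu-memory-utilization for the safety server from 0.3 to 0.6 gives it a larger KV-cache share; together with the inference server's 0.7 that sums to 1.3, so the two servers cannot share a single GPU and need to land on different devices or nodes. Startup can be watched for allocation errors; a sketch using the label from the template:

    # follow the safety server logs and watch for memory or startup errors
    kubectl logs -f -l app.kubernetes.io/name=vllm-safety --tail=100 | grep -i -E 'memory|error|cuda'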