second try

Kai Wu 2025-07-30 14:51:43 -07:00
parent 31a15332c4
commit 1cb9d3bca2
11 changed files with 237 additions and 64 deletions

View file

@@ -6,12 +6,18 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Check if NGC_API_KEY is provided as argument
if [ -n "$1" ]; then
export NGC_API_KEY=$1
echo "Using NGC API key provided as argument."
fi
export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
export CODE_MODEL=bigcode/starcoder2-7b
# Set USE_EBS to false if you don't have permission to use EKS EBS
export USE_EBS=${USE_EBS:-false}
@@ -24,13 +30,16 @@ else
exit 1
fi
if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
echo "ERROR: GITHUB_CLIENT_ID not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
exit 1
fi
if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
# NGC_API_KEY should be set by the user or provided as argument; base64 encode it for the secret
if [ -n "${NGC_API_KEY:-}" ]; then
# base64 output is piped through tr to strip line wraps, so envsubst emits single-line secret values
export NGC_API_KEY_BASE64=$(echo -n "$NGC_API_KEY" | base64 | tr -d '\n')
# Create Docker config JSON for NGC image pull
NGC_DOCKER_CONFIG="{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"$NGC_API_KEY\"}}}"
export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64 | tr -d '\n')
else
echo "ERROR: NGC_API_KEY not set. You need it for NIM to download models from NVIDIA."
echo "Usage: $0 [your-ngc-api-key]"
echo "You can either provide your NGC API key as an argument or set it as an environment variable."
exit 1
fi
@@ -41,20 +50,18 @@ fi
# Apply the HF token and NGC secrets (set-secret.yaml.template) if HF_TOKEN is provided
if [ -n "${HF_TOKEN:-}" ]; then
envsubst < ./set-secret.yaml.template | kubectl apply -f -
fi
set -euo pipefail
set -x
# Apply the HF token secret if HF_TOKEN is provided
if [ -n "${HF_TOKEN:-}" ]; then
envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
fi
# Apply templates with appropriate storage configuration based on USE_EBS setting
if [ "$USE_EBS" = "true" ]; then
echo "Using EBS storage for persistent volumes"
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
@@ -70,7 +77,7 @@ else
echo "Using emptyDir for storage (data will not persist across pod restarts)"
# Process templates to replace EBS storage with emptyDir
envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./vllm-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
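For reference, the nvcr.io pull secret assembled above can also be created directly with kubectl, which builds and encodes the .dockerconfigjson itself (a sketch, reusing the secret name and registry from set-secret.yaml.template):

  kubectl create secret docker-registry ngc-docker-registry \
    --docker-server=nvcr.io \
    --docker-username='$oauthtoken' \
    --docker-password="$NGC_API_KEY"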

docs/source/distributions/k8s/delete.sh Normal file → Executable file
View file

@@ -21,6 +21,14 @@ if [ -n "${HF_TOKEN:-}" ]; then
export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
fi
# NGC_API_KEY should be set by the user; base64 encode it for the secret
if [ -n "${NGC_API_KEY:-}" ]; then
export NGC_API_KEY_BASE64=$(echo -n "$NGC_API_KEY" | base64 | tr -d '\n')
# Create Docker config JSON for NGC image pull
NGC_DOCKER_CONFIG="{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"$NGC_API_KEY\"}}}"
export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64 | tr -d '\n')
fi
set -euo pipefail
set -x
@@ -45,14 +53,16 @@ envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=t
envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete vllm-safety deployment
envsubst < ./vllm-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete vllm deployment
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete the HF token secret if it exists
if [ -n "${HF_TOKEN:-}" ]; then
envsubst < ./hf-token-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
fi
# NGC API key secrets are defined in set-secret.yaml.template and are deleted along with it above
echo "All LlamaStack Kubernetes resources have been deleted."

View file

@@ -1,7 +0,0 @@
apiVersion: v1
kind: Secret
metadata:
name: hf-token-secret
type: Opaque
data:
token: ${HF_TOKEN_BASE64}

View file

@@ -1,25 +1,25 @@
# -------------------------------------------------
# NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
# NVIDIA NIM - Code
# -------------------------------------------------
apiVersion: apps/v1
kind: Deployment
metadata:
name: llama-nano-nim
name: llm-nim-code
labels:
app: llama-nano-nim
app: llm-nim-code
spec:
replicas: 1
selector:
matchLabels:
app: llama-nano-nim
app: llm-nim-code
template:
metadata:
labels:
app: llama-nano-nim
app: llm-nim-code
spec:
imagePullSecrets:
- name: ngc-secret # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
- name: ngc-docker-registry # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
volumes:
- name: model-cache
emptyDir:
@@ -27,7 +27,7 @@ spec:
sizeLimit: 12Gi # sized for the earlier 4 B model; likely needs increasing for starcoder2-7b
containers:
- name: nim
image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
image: nvcr.io/nim/bigcode/starcoder2-7b:1.8.1
ports:
- name: http-openai
containerPort: 8000
@@ -36,7 +36,7 @@ spec:
nvidia.com/gpu: 1
env:
- name: NIM_MODEL_NAME
value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
value: "nvidia/starcoder2-7b"
- name: NGC_API_KEY
valueFrom:
secretKeyRef:
@@ -49,23 +49,23 @@ spec:
httpGet:
path: /v1/models
port: http-openai
initialDelaySeconds: 20
periodSeconds: 10
initialDelaySeconds: 360
periodSeconds: 360
livenessProbe:
httpGet:
path: /v1/health
port: http-openai
initialDelaySeconds: 60
periodSeconds: 30
initialDelaySeconds: 600
periodSeconds: 360
---
apiVersion: v1
kind: Service
metadata:
name: llama-nano-nim
name: llm-nim-code
spec:
selector:
app: llama-nano-nim
app: llm-nim-code
ports:
- name: http-openai
port: 8000
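Once the NIM pod passes its readiness probe (the timings above allow several minutes for the model download), the service can be smoke-tested locally using the name and port defined in this template (a sketch; the /v1/models and /v1/health paths are the same ones the probes hit):

  kubectl port-forward svc/llm-nim-code 8000:8000 &
  curl http://localhost:8000/v1/models
  curl http://localhost:8000/v1/health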

View file

@@ -0,0 +1,31 @@
apiVersion: v1
kind: Secret
metadata:
name: hf-token-secret
type: Opaque
data:
token: ${HF_TOKEN_BASE64}
---
# -------------------------------------------------
# NGC Docker Registry Secret
# -------------------------------------------------
apiVersion: v1
kind: Secret
metadata:
name: ngc-docker-registry
type: kubernetes.io/dockerconfigjson
data:
.dockerconfigjson: ${NGC_DOCKER_CONFIG_JSON}
---
# -------------------------------------------------
# NGC API Secret
# -------------------------------------------------
apiVersion: v1
kind: Secret
metadata:
name: ngc-api
type: Opaque
data:
NGC_API_KEY: ${NGC_API_KEY_BASE64}
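After apply, the rendered secrets can be sanity-checked by decoding them back; the names below match the manifests above (a sketch):

  kubectl get secret ngc-api -o jsonpath='{.data.NGC_API_KEY}' | base64 --decode
  kubectl get secret ngc-docker-registry -o jsonpath='{.data.\.dockerconfigjson}' | base64 --decode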

View file

@@ -0,0 +1,125 @@
apiVersion: v1
data:
stack_run_config.yaml: |
version: '2'
image_name: kubernetes-demo
apis:
- agents
- inference
- safety
- telemetry
- tool_runtime
- vector_io
providers:
inference:
- provider_id: vllm-inference
provider_type: remote::vllm
config:
url: ${env.VLLM_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: nvidia
provider_type: remote::nvidia
config:
url: ${env.NVIDIA_BASE_URL:=http://localhost:8001/v1}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
vector_io:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:=}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
config:
excluded_categories: []
agents:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
persistence_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
telemetry:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
config:
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
max_results: 3
- provider_id: tavily-search
provider_type: remote::tavily-search
config:
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
max_results: 3
- provider_id: rag-runtime
provider_type: inline::rag-runtime
config: {}
- provider_id: model-context-protocol
provider_type: remote::model-context-protocol
config: {}
metadata_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
table_name: llamastack_kvstore
inference_store:
type: postgres
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
models:
- metadata:
embedding_dimension: 384
model_id: all-MiniLM-L6-v2
provider_id: sentence-transformers
model_type: embedding
- metadata: {}
model_id: ${env.INFERENCE_MODEL}
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
provider_id: nvidia
model_type: llm
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
provider_id: tavily-search
- toolgroup_id: builtin::rag
provider_id: rag-runtime
server:
port: 8321
kind: ConfigMap
metadata:
creationTimestamp: null
name: llama-stack-config
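Values such as ${env.CODE_MODEL:=bigcode/starcoder2-7b} are resolved by the llama-stack server at startup, falling back to the default after := when the variable is unset. Once applied, the embedded run config can be read back to confirm those defaults (a sketch using the ConfigMap name and data key above):

  kubectl get configmap llama-stack-config -o jsonpath='{.data.stack_run_config\.yaml}'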

View file

@@ -28,27 +28,34 @@ spec:
initContainers:
- name: wait-for-vllm-server
image: busybox:1.28
command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8000; do echo waiting for vllm-server on port 8000; sleep 2; done;']
- name: wait-for-vllm-server-safety
command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8001; do echo waiting for vllm-server on port 8001; sleep 2; done;']
- name: wait-for-llm-nim-code
image: busybox:1.28
command: ['sh', '-c', 'until nc -z vllm-server-safety.default.svc.cluster.local 8001; do echo waiting for vllm-server-safety on port 8001; sleep 2; done;']
command: ['sh', '-c', 'until nc -z llm-nim-code.default.svc.cluster.local 8000; do echo waiting for llm-nim-code on port 8000; sleep 2; done;']
containers:
- name: llama-stack
image: llamastack/distribution-starter:latest
imagePullPolicy: Always # since we have specified latest instead of a version
resources:
requests:
memory: "512Mi"
cpu: "500m"
ephemeral-storage: "2Gi"
limits:
memory: "1Gi"
cpu: "1000m"
ephemeral-storage: "5Gi"
env:
- name: ENABLE_CHROMADB
value: "true"
- name: CHROMADB_URL
value: http://chromadb.default.svc.cluster.local:6000
- name: VLLM_URL
value: http://vllm-server.default.svc.cluster.local:8000/v1
value: http://vllm-server.default.svc.cluster.local:8001/v1
- name: VLLM_MAX_TOKENS
value: "3072"
- name: NVIDIA_BASE_URL
value: http://llama-nano-nim.default.svc.cluster.local:8000/v1
- name: VLLM_SAFETY_URL
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
value: http://llm-nim-code.default.svc.cluster.local:8000/v1
- name: POSTGRES_HOST
value: postgres-server.default.svc.cluster.local
- name: POSTGRES_PORT
@@ -57,8 +64,8 @@ spec:
value: "false"
- name: INFERENCE_MODEL
value: "${INFERENCE_MODEL}"
- name: SAFETY_MODEL
value: "${SAFETY_MODEL}"
- name: CODE_MODEL
value: "${CODE_MODEL}"
- name: TAVILY_SEARCH_API_KEY
value: "${TAVILY_SEARCH_API_KEY}"
command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]

View file

@@ -16,13 +16,12 @@ providers:
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: vllm-safety
provider_type: remote::vllm
- provider_id: nvidia
provider_type: remote::nvidia
config:
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
url: ${env.NVIDIA_BASE_URL:=http://localhost:8001/v1}
api_key: ${env.NVIDIA_API_KEY:=}
append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@@ -103,11 +102,9 @@ models:
provider_id: vllm-inference
model_type: llm
- metadata: {}
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
provider_id: vllm-safety
model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
provider_id: nvidia
model_type: llm
shields:
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []

View file

@@ -34,7 +34,7 @@ spec:
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args:
- "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4"
- "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4 --port 8001"
env:
- name: INFERENCE_MODEL
value: "${INFERENCE_MODEL}"
@@ -44,7 +44,7 @@ spec:
name: hf-token-secret
key: token
ports:
- containerPort: 8000
- containerPort: 8001
resources:
limits:
nvidia.com/gpu: 1
@@ -67,6 +67,6 @@ spec:
app.kubernetes.io/name: vllm
ports:
- protocol: TCP
port: 8000
targetPort: 8000
port: 8001
targetPort: 8001
type: ClusterIP
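With both the server and the Service moved to 8001, the endpoint can be re-checked the same way as before the port change (a sketch, assuming the Service is named vllm-server as referenced by the stack deployment; vLLM's OpenAI-compatible server serves /v1/models):

  kubectl port-forward svc/vllm-server 8001:8001 &
  curl http://localhost:8001/v1/models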