add NIM k8s solution

Kai Wu 2025-07-29 09:01:21 -07:00
parent 95d25ddfe2
commit 8c0f328cbc
4 changed files with 133 additions and 128 deletions

@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack
export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
# Set USE_EBS to false if you don't have permission to use EKS EBS
export USE_EBS=${USE_EBS:-false}
# HF_TOKEN should be set by the user; base64 encode it for the secret
if [ -n "${HF_TOKEN:-}" ]; then
  export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
fi
set -euo pipefail
set -x
# Delete resources in reverse order of creation to handle dependencies properly
# Delete UI deployment
envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete ingress
envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete stack deployment
envsubst < ./stack-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete configmap
kubectl delete configmap llama-stack-config --ignore-not-found=true
# Delete chroma deployment
envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete postgres deployment
envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete vllm-safety deployment
envsubst < ./vllm-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete vllm deployment
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete the HF token secret if it exists
if [ -n "${HF_TOKEN:-}" ]; then
  envsubst < ./hf-token-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
fi
echo "All LlamaStack Kubernetes resources have been deleted."

@@ -0,0 +1,73 @@
# -------------------------------------------------
# NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
# -------------------------------------------------
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-nano-nim
  labels:
    app: llama-nano-nim
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-nano-nim
  template:
    metadata:
      labels:
        app: llama-nano-nim
    spec:
      imagePullSecrets:
      - name: ngc-secret   # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
      volumes:
      - name: model-cache
        emptyDir:
          medium: Memory   # tmpfs; omit or use "" to back by node disk
          sizeLimit: 12Gi  # fits the 4B model + tensors; adjust if needed
      containers:
      - name: nim
        image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
        ports:
        - name: http-openai
          containerPort: 8000
        resources:
          limits:
            nvidia.com/gpu: 1
        env:
        - name: NIM_MODEL_NAME
          value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
        - name: NGC_API_KEY
          valueFrom:
            secretKeyRef:
              name: ngc-api
              key: NGC_API_KEY
        volumeMounts:
        - name: model-cache
          mountPath: /models   # default NIM cache path
        readinessProbe:
          httpGet:
            path: /v1/models
            port: http-openai
          initialDelaySeconds: 20
          periodSeconds: 10
        livenessProbe:
          httpGet:
            path: /v1/health
            port: http-openai
          initialDelaySeconds: 60
          periodSeconds: 30
---
apiVersion: v1
kind: Service
metadata:
  name: llama-nano-nim
spec:
  selector:
    app: llama-nano-nim
  ports:
  - name: http-openai
    port: 8000
    targetPort: 8000
  type: ClusterIP
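
The Deployment above references two pre-existing secrets: ngc-secret (the image-pull secret noted on imagePullSecrets) and ngc-api (the runtime NGC_API_KEY). A plausible way to create them with kubectl, assuming your NGC API key is exported as NGC_API_KEY; the literal username $oauthtoken is the convention nvcr.io expects:

# Image-pull secret for nvcr.io; the username is literally "$oauthtoken".
kubectl create secret docker-registry ngc-secret \
  --docker-server=nvcr.io \
  --docker-username='$oauthtoken' \
  --docker-password="$NGC_API_KEY"
# Generic secret consumed by the container's NGC_API_KEY env var.
kubectl create secret generic ngc-api \
  --from-literal=NGC_API_KEY="$NGC_API_KEY"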

@@ -1,128 +0,0 @@
apiVersion: v1
data:
  stack_run_config.yaml: |
    version: '2'
    image_name: kubernetes-demo
    apis:
    - agents
    - inference
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
      inference:
      - provider_id: vllm-inference
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: vllm-safety
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
        config:
          url: ${env.CHROMADB_URL:=}
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
        config:
          excluded_categories: []
      agents:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          persistence_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
          responses_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
        config:
          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: tavily-search
        provider_type: remote::tavily-search
        config:
          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: rag-runtime
        provider_type: inline::rag-runtime
        config: {}
      - provider_id: model-context-protocol
        provider_type: remote::model-context-protocol
        config: {}
    metadata_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
      table_name: llamastack_kvstore
    inference_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
    models:
    - metadata:
        embedding_dimension: 384
      model_id: all-MiniLM-L6-v2
      provider_id: sentence-transformers
      model_type: embedding
    - metadata: {}
      model_id: ${env.INFERENCE_MODEL}
      provider_id: vllm-inference
      model_type: llm
    - metadata: {}
      model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
      provider_id: vllm-safety
      model_type: llm
    shields:
    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
    vector_dbs: []
    datasets: []
    scoring_fns: []
    benchmarks: []
    tool_groups:
    - toolgroup_id: builtin::websearch
      provider_id: tavily-search
    - toolgroup_id: builtin::rag
      provider_id: rag-runtime
    server:
      port: 8321
kind: ConfigMap
metadata:
  creationTimestamp: null
  name: llama-stack-config

@@ -45,6 +45,8 @@ spec:
          value: http://vllm-server.default.svc.cluster.local:8000/v1
        - name: VLLM_MAX_TOKENS
          value: "3072"
        - name: NVIDIA_BASE_URL
          value: http://llama-nano-nim.default.svc.cluster.local:8000/v1
        - name: VLLM_SAFETY_URL
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
        - name: POSTGRES_HOST
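
NVIDIA_BASE_URL points the stack at the NIM Service defined earlier. The run config that replaces the deleted ConfigMap is not shown in this diff; a sketch of an inference provider entry that would consume this variable, assuming Llama Stack's remote::nvidia provider type and an illustrative provider_id, might look like:

# Hypothetical stack_run_config.yaml fragment (not shown in this diff);
# provider_id and the defaults are assumptions.
inference:
- provider_id: nvidia
  provider_type: remote::nvidia
  config:
    url: ${env.NVIDIA_BASE_URL:=http://localhost:8000}
    api_key: ${env.NVIDIA_API_KEY:=}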