mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-01 00:05:18 +00:00
add NIM k8s solution
This commit is contained in:
parent
95d25ddfe2
commit
8c0f328cbc
4 changed files with 133 additions and 128 deletions
58
docs/source/distributions/k8s/delete.sh
Normal file
58
docs/source/distributions/k8s/delete.sh
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||||
|
# All rights reserved.
|
||||||
|
#
|
||||||
|
# This source code is licensed under the terms described in the LICENSE file in
|
||||||
|
# the root directory of this source tree.
|
||||||
|
|
||||||
|
# Tear down every LlamaStack Kubernetes resource created from the templates in
# this directory. Each delete uses --ignore-not-found, so the script can be
# re-run safely after a partial teardown.

# Fail fast from the very first command: abort on errors, on unset variables,
# and on a failure anywhere in a pipeline.
set -euo pipefail

# Resolve the templates relative to this script rather than the caller's
# working directory, so the script behaves the same from any location.
cd "$(dirname "${BASH_SOURCE[0]}")"

# Values substituted into the templates by envsubst.
export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack

export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

# Set USE_EBS to false if you don't have permission to use EKS EBS
export USE_EBS=${USE_EBS:-false}

# HF_TOKEN_BASE64 is exported for the HF token secret template (presumably it
# references ${HF_TOKEN_BASE64}; verify against hf-token-secret.yaml.template).
# kubectl identifies the object to delete by kind and name, so the token value
# is irrelevant here: fall back to a placeholder when HF_TOKEN is not set, so
# the secret is still removed instead of being left behind.
# Assignment and export are separate so a failing pipeline is not masked.
if [ -n "${HF_TOKEN:-}" ]; then
  HF_TOKEN_BASE64=$(printf '%s' "$HF_TOKEN" | base64 | tr -d '\n')
else
  HF_TOKEN_BASE64=$(printf '%s' "placeholder" | base64 | tr -d '\n')
fi
export HF_TOKEN_BASE64

# Enable tracing only after the token has been encoded, so the raw token is
# never echoed to the terminal.
set -x

# Render one template with envsubst and delete the resources it describes.
#   $1 - template file name, relative to this directory
delete_template() {
  envsubst < "./$1" | kubectl delete -f - --ignore-not-found=true
}

# Delete resources in reverse order of creation to handle dependencies properly

# Delete UI deployment
delete_template ui-k8s.yaml.template

# Delete ingress
delete_template ingress-k8s.yaml.template

# Delete stack deployment
delete_template stack-k8s.yaml.template

# Delete configmap
kubectl delete configmap llama-stack-config --ignore-not-found=true

# Delete chroma deployment
delete_template chroma-k8s.yaml.template

# Delete postgres deployment
delete_template postgres-k8s.yaml.template

# Delete vllm-safety deployment
delete_template vllm-safety-k8s.yaml.template

# Delete vllm deployment
delete_template vllm-k8s.yaml.template

# Delete the HF token secret if it exists
delete_template hf-token-secret.yaml.template

echo "All LlamaStack Kubernetes resources have been deleted."
|
73
docs/source/distributions/k8s/llama-nim.yaml.template
Normal file
73
docs/source/distributions/k8s/llama-nim.yaml.template
Normal file
|
@ -0,0 +1,73 @@
|
||||||
|
# -------------------------------------------------
|
||||||
|
# NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
|
||||||
|
# -------------------------------------------------
|
||||||
|
|
||||||
|
# Single-replica Deployment that runs the NIM container on one GPU and serves
# HTTP on port 8000 (named "http-openai").
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-nano-nim
  labels:
    app: llama-nano-nim
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-nano-nim
  template:
    metadata:
      labels:
        app: llama-nano-nim
    spec:
      imagePullSecrets:
        - name: ngc-secret  # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
      volumes:
        - name: model-cache
          emptyDir:
            # tmpfs; omit or use "" to back by node disk. Data written to a
            # memory-backed emptyDir counts against the container's memory.
            medium: Memory
            sizeLimit: 12Gi  # fits the 4 B model + tensors; adjust if needed
      containers:
        - name: nim
          # NOTE(review): NGC image paths normally use dots in the model
          # version (llama-3.1-...-v1.1) — confirm this repository path exists.
          image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
          ports:
            - name: http-openai
              containerPort: 8000
          resources:
            limits:
              nvidia.com/gpu: 1
          env:
            # NOTE(review): confirm NIM_MODEL_NAME is the intended variable and
            # that this value matches what the container expects.
            - name: NIM_MODEL_NAME
              value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
            - name: NGC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ngc-api
                  key: NGC_API_KEY
          volumeMounts:
            - name: model-cache
              # NIM's documented default cache location (NIM_CACHE_PATH); the
              # volume must be mounted here for downloaded weights to land on it.
              mountPath: /opt/nim/.cache
          # Model download and engine start-up can take many minutes. The
          # startup probe holds off the liveness probe until the server reports
          # ready, allowing up to 180 * 10s = 30 minutes before a restart.
          startupProbe:
            httpGet:
              path: /v1/health/ready
              port: http-openai
            periodSeconds: 10
            failureThreshold: 180
          readinessProbe:
            httpGet:
              path: /v1/models
              port: http-openai
            initialDelaySeconds: 20
            periodSeconds: 10
          livenessProbe:
            httpGet:
              # NIM serves liveness at /v1/health/live; a bare /v1/health is not
              # a documented route and would fail the probe.
              path: /v1/health/live
              port: http-openai
            initialDelaySeconds: 60
            periodSeconds: 30
|
||||||
|
|
||||||
|
---
|
||||||
|
# Cluster-internal Service in front of the pods labelled app=llama-nano-nim.
# It forwards port 8000 to container port 8000, giving the pods a stable DNS
# name (llama-nano-nim.<namespace>.svc.cluster.local) inside the cluster.
apiVersion: v1
kind: Service
metadata:
  name: llama-nano-nim
spec:
  type: ClusterIP
  selector:
    app: llama-nano-nim
  ports:
    - name: http-openai
      port: 8000
      targetPort: 8000
|
|
@ -1,128 +0,0 @@
|
||||||
apiVersion: v1
|
|
||||||
data:
|
|
||||||
stack_run_config.yaml: |
|
|
||||||
version: '2'
|
|
||||||
image_name: kubernetes-demo
|
|
||||||
apis:
|
|
||||||
- agents
|
|
||||||
- inference
|
|
||||||
- safety
|
|
||||||
- telemetry
|
|
||||||
- tool_runtime
|
|
||||||
- vector_io
|
|
||||||
providers:
|
|
||||||
inference:
|
|
||||||
- provider_id: vllm-inference
|
|
||||||
provider_type: remote::vllm
|
|
||||||
config:
|
|
||||||
url: ${env.VLLM_URL:=http://localhost:8000/v1}
|
|
||||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
|
||||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
|
||||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
|
||||||
- provider_id: vllm-safety
|
|
||||||
provider_type: remote::vllm
|
|
||||||
config:
|
|
||||||
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
|
|
||||||
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
|
|
||||||
api_token: ${env.VLLM_API_TOKEN:=fake}
|
|
||||||
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
|
|
||||||
- provider_id: sentence-transformers
|
|
||||||
provider_type: inline::sentence-transformers
|
|
||||||
config: {}
|
|
||||||
vector_io:
|
|
||||||
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
|
|
||||||
provider_type: remote::chromadb
|
|
||||||
config:
|
|
||||||
url: ${env.CHROMADB_URL:=}
|
|
||||||
safety:
|
|
||||||
- provider_id: llama-guard
|
|
||||||
provider_type: inline::llama-guard
|
|
||||||
config:
|
|
||||||
excluded_categories: []
|
|
||||||
agents:
|
|
||||||
- provider_id: meta-reference
|
|
||||||
provider_type: inline::meta-reference
|
|
||||||
config:
|
|
||||||
persistence_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
responses_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
telemetry:
|
|
||||||
- provider_id: meta-reference
|
|
||||||
provider_type: inline::meta-reference
|
|
||||||
config:
|
|
||||||
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
|
|
||||||
sinks: ${env.TELEMETRY_SINKS:=console}
|
|
||||||
tool_runtime:
|
|
||||||
- provider_id: brave-search
|
|
||||||
provider_type: remote::brave-search
|
|
||||||
config:
|
|
||||||
api_key: ${env.BRAVE_SEARCH_API_KEY:+}
|
|
||||||
max_results: 3
|
|
||||||
- provider_id: tavily-search
|
|
||||||
provider_type: remote::tavily-search
|
|
||||||
config:
|
|
||||||
api_key: ${env.TAVILY_SEARCH_API_KEY:+}
|
|
||||||
max_results: 3
|
|
||||||
- provider_id: rag-runtime
|
|
||||||
provider_type: inline::rag-runtime
|
|
||||||
config: {}
|
|
||||||
- provider_id: model-context-protocol
|
|
||||||
provider_type: remote::model-context-protocol
|
|
||||||
config: {}
|
|
||||||
metadata_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
table_name: llamastack_kvstore
|
|
||||||
inference_store:
|
|
||||||
type: postgres
|
|
||||||
host: ${env.POSTGRES_HOST:=localhost}
|
|
||||||
port: ${env.POSTGRES_PORT:=5432}
|
|
||||||
db: ${env.POSTGRES_DB:=llamastack}
|
|
||||||
user: ${env.POSTGRES_USER:=llamastack}
|
|
||||||
password: ${env.POSTGRES_PASSWORD:=llamastack}
|
|
||||||
models:
|
|
||||||
- metadata:
|
|
||||||
embedding_dimension: 384
|
|
||||||
model_id: all-MiniLM-L6-v2
|
|
||||||
provider_id: sentence-transformers
|
|
||||||
model_type: embedding
|
|
||||||
- metadata: {}
|
|
||||||
model_id: ${env.INFERENCE_MODEL}
|
|
||||||
provider_id: vllm-inference
|
|
||||||
model_type: llm
|
|
||||||
- metadata: {}
|
|
||||||
model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
|
||||||
provider_id: vllm-safety
|
|
||||||
model_type: llm
|
|
||||||
shields:
|
|
||||||
- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
|
|
||||||
vector_dbs: []
|
|
||||||
datasets: []
|
|
||||||
scoring_fns: []
|
|
||||||
benchmarks: []
|
|
||||||
tool_groups:
|
|
||||||
- toolgroup_id: builtin::websearch
|
|
||||||
provider_id: tavily-search
|
|
||||||
- toolgroup_id: builtin::rag
|
|
||||||
provider_id: rag-runtime
|
|
||||||
server:
|
|
||||||
port: 8321
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
creationTimestamp: null
|
|
||||||
name: llama-stack-config
|
|
|
@ -45,6 +45,8 @@ spec:
|
||||||
value: http://vllm-server.default.svc.cluster.local:8000/v1
|
value: http://vllm-server.default.svc.cluster.local:8000/v1
|
||||||
- name: VLLM_MAX_TOKENS
|
- name: VLLM_MAX_TOKENS
|
||||||
value: "3072"
|
value: "3072"
|
||||||
|
- name: NVIDIA_BASE_URL
|
||||||
|
value: http://llama-nano-nim.default.svc.cluster.local:8000/v1
|
||||||
- name: VLLM_SAFETY_URL
|
- name: VLLM_SAFETY_URL
|
||||||
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
|
value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
|
||||||
- name: POSTGRES_HOST
|
- name: POSTGRES_HOST
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue