second try

Kai Wu 2025-07-30 14:51:43 -07:00
parent 31a15332c4
commit 1cb9d3bca2
11 changed files with 237 additions and 64 deletions

@@ -6,12 +6,18 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.

+# Check if NGC_API_KEY is provided as argument
+if [ -n "$1" ]; then
+  export NGC_API_KEY=$1
+  echo "Using NGC API key provided as argument."
+fi
+
 export POSTGRES_USER=llamastack
 export POSTGRES_DB=llamastack
 export POSTGRES_PASSWORD=llamastack

-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
-export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
+export CODE_MODEL=bigcode/starcoder2-7b

 # Set USE_EBS to false if you don't have permission to use EKS EBS
 export USE_EBS=${USE_EBS:-false}
@@ -24,13 +30,16 @@ else
   exit 1
 fi

-if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
-  echo "ERROR: GITHUB_CLIENT_ID not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
-  exit 1
-fi
-
-if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
-  echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for Github login to work. Refer to https://llama-stack.readthedocs.io/en/latest/deploying/index.html#kubernetes-deployment-guide"
+# NGC_API_KEY should be set by the user or provided as argument; base64 encode it for the secret
+if [ -n "${NGC_API_KEY:-}" ]; then
+  export NGC_API_KEY_BASE64=$(echo -n "$NGC_API_KEY" | base64)
+  # Create Docker config JSON for NGC image pull
+  NGC_DOCKER_CONFIG="{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"$NGC_API_KEY\"}}}"
+  export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64)
+else
+  echo "ERROR: NGC_API_KEY not set. You need it for NIM to download models from NVIDIA."
+  echo "Usage: $0 [your-ngc-api-key]"
+  echo "You can either provide your NGC API key as an argument or set it as an environment variable."
   exit 1
 fi
@@ -41,20 +50,18 @@ fi

+# Apply the HF token secret if HF_TOKEN is provided
+if [ -n "${HF_TOKEN:-}" ]; then
+  envsubst < ./set-secret.yaml.template | kubectl apply -f -
+fi
+
 set -euo pipefail
 set -x

-# Apply the HF token secret if HF_TOKEN is provided
-if [ -n "${HF_TOKEN:-}" ]; then
-  envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
-fi
-
 # Apply templates with appropriate storage configuration based on USE_EBS setting
 if [ "$USE_EBS" = "true" ]; then
   echo "Using EBS storage for persistent volumes"
   envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
-  envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./llama-nim.yaml.template | kubectl apply -f -
   envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -

@@ -70,7 +77,7 @@ else
   echo "Using emptyDir for storage (data will not persist across pod restarts)"
   # Process templates to replace EBS storage with emptyDir
   envsubst < ./vllm-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
-  envsubst < ./vllm-safety-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./llama-nim.yaml.template | kubectl apply -f -
   envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
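For reference, the hand-rolled docker-config JSON that the deploy script base64-encodes above can also be produced by kubectl itself; a minimal sketch (secret name taken from set-secret.yaml.template in this commit, the kubectl invocation is an illustration rather than part of the repo):

# Equivalent way to build the nvcr.io image-pull secret assembled by hand above
kubectl create secret docker-registry ngc-docker-registry \
  --docker-server=nvcr.io \
  --docker-username='$oauthtoken' \
  --docker-password="$NGC_API_KEY" \
  --dry-run=client -o yaml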

docs/source/distributions/k8s/delete.sh Normal file → Executable file
@@ -21,6 +21,14 @@ if [ -n "${HF_TOKEN:-}" ]; then
   export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
 fi

+# NGC_API_KEY should be set by the user; base64 encode it for the secret
+if [ -n "${NGC_API_KEY:-}" ]; then
+  export NGC_API_KEY_BASE64=$(echo -n "$NGC_API_KEY" | base64)
+  # Create Docker config JSON for NGC image pull
+  NGC_DOCKER_CONFIG="{\"auths\":{\"nvcr.io\":{\"username\":\"\$oauthtoken\",\"password\":\"$NGC_API_KEY\"}}}"
+  export NGC_DOCKER_CONFIG_JSON=$(echo -n "$NGC_DOCKER_CONFIG" | base64)
+fi
+
 set -euo pipefail
 set -x
@@ -45,14 +53,16 @@ envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true

 # Delete vllm-safety deployment
-envsubst < ./vllm-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+envsubst < ./llama-nim.yaml.template | kubectl delete -f - --ignore-not-found=true

 # Delete vllm deployment
 envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true

 # Delete the HF token secret if it exists
 if [ -n "${HF_TOKEN:-}" ]; then
-  envsubst < ./hf-token-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
+  envsubst < ./set-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
 fi
+
+# NGC API key secrets are now part of llama-nim.yaml.template and are deleted with it

 echo "All LlamaStack Kubernetes resources have been deleted."

@@ -1,7 +0,0 @@
-apiVersion: v1
-kind: Secret
-metadata:
-  name: hf-token-secret
-type: Opaque
-data:
-  token: ${HF_TOKEN_BASE64}

@@ -1,25 +1,25 @@
 # -------------------------------------------------
-# NVIDIA NIM - Llama-3.1 Nemotron-Nano-4B-v1.1
+# NVIDIA NIM - Code
 # -------------------------------------------------
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: llama-nano-nim
+  name: llm-nim-code
   labels:
-    app: llama-nano-nim
+    app: llm-nim-code
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: llama-nano-nim
+      app: llm-nim-code
   template:
     metadata:
       labels:
-        app: llama-nano-nim
+        app: llm-nim-code
     spec:
       imagePullSecrets:
-        - name: ngc-secret            # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
+        - name: ngc-docker-registry   # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
       volumes:
         - name: model-cache
           emptyDir:
@@ -27,7 +27,7 @@ spec:
             sizeLimit: 12Gi   # fits the 4 B model + tensors; adjust if needed
       containers:
         - name: nim
-          image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
+          image: nvcr.io/nim/bigcode/starcoder2-7b:1.8.1
          ports:
            - name: http-openai
              containerPort: 8000
@@ -36,7 +36,7 @@ spec:
              nvidia.com/gpu: 1
          env:
            - name: NIM_MODEL_NAME
-              value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
+              value: "nvidia/starcoder2-7b"
            - name: NGC_API_KEY
              valueFrom:
                secretKeyRef:
@@ -49,23 +49,23 @@ spec:
            httpGet:
              path: /v1/models
              port: http-openai
-            initialDelaySeconds: 20
-            periodSeconds: 10
+            initialDelaySeconds: 360
+            periodSeconds: 360
          livenessProbe:
            httpGet:
              path: /v1/health
              port: http-openai
-            initialDelaySeconds: 60
-            periodSeconds: 30
+            initialDelaySeconds: 600
+            periodSeconds: 360
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: llama-nano-nim
+  name: llm-nim-code
 spec:
   selector:
-    app: llama-nano-nim
+    app: llm-nim-code
   ports:
     - name: http-openai
       port: 8000
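A quick manual check that mirrors the probes defined above, assuming kubectl access to the default namespace used by this commit:

kubectl port-forward svc/llm-nim-code 8000:8000 &
curl -s http://localhost:8000/v1/models   # readiness path used above
curl -s http://localhost:8000/v1/health   # liveness path used above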

@@ -0,0 +1,31 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token-secret
+type: Opaque
+data:
+  token: ${HF_TOKEN_BASE64}
+---
+# -------------------------------------------------
+# NGC Docker Registry Secret
+# -------------------------------------------------
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ngc-docker-registry
+type: kubernetes.io/dockerconfigjson
+data:
+  .dockerconfigjson: ${NGC_DOCKER_CONFIG_JSON}
+---
+# -------------------------------------------------
+# NGC API Secret
+# -------------------------------------------------
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ngc-api
+type: Opaque
+data:
+  NGC_API_KEY: ${NGC_API_KEY_BASE64}
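Once this template has been applied, the three secrets can be sanity-checked like so (a sketch; names and keys are taken from the template above):

kubectl get secret hf-token-secret -o jsonpath='{.data.token}' | base64 -d
kubectl get secret ngc-api -o jsonpath='{.data.NGC_API_KEY}' | base64 -d
kubectl get secret ngc-docker-registry -o jsonpath='{.type}'   # expect kubernetes.io/dockerconfigjson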

@@ -0,0 +1,125 @@
+apiVersion: v1
+data:
+  stack_run_config.yaml: |
+    version: '2'
+    image_name: kubernetes-demo
+    apis:
+    - agents
+    - inference
+    - safety
+    - telemetry
+    - tool_runtime
+    - vector_io
+    providers:
+      inference:
+      - provider_id: vllm-inference
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      - provider_id: nvidia
+        provider_type: remote::nvidia
+        config:
+          url: ${env.NVIDIA_BASE_URL:=http://localhost:8001/v1}
+          api_key: ${env.NVIDIA_API_KEY:=}
+          append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
+      - provider_id: sentence-transformers
+        provider_type: inline::sentence-transformers
+        config: {}
+      vector_io:
+      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
+        provider_type: remote::chromadb
+        config:
+          url: ${env.CHROMADB_URL:=}
+      safety:
+      - provider_id: llama-guard
+        provider_type: inline::llama-guard
+        config:
+          excluded_categories: []
+      agents:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          persistence_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+          responses_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+      telemetry:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+          sinks: ${env.TELEMETRY_SINKS:=console}
+      tool_runtime:
+      - provider_id: brave-search
+        provider_type: remote::brave-search
+        config:
+          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: tavily-search
+        provider_type: remote::tavily-search
+        config:
+          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: rag-runtime
+        provider_type: inline::rag-runtime
+        config: {}
+      - provider_id: model-context-protocol
+        provider_type: remote::model-context-protocol
+        config: {}
+    metadata_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+      table_name: llamastack_kvstore
+    inference_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+    models:
+    - metadata:
+        embedding_dimension: 384
+      model_id: all-MiniLM-L6-v2
+      provider_id: sentence-transformers
+      model_type: embedding
+    - metadata: {}
+      model_id: ${env.INFERENCE_MODEL}
+      provider_id: vllm-inference
+      model_type: llm
+    - metadata: {}
+      model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
+      provider_id: nvidia
+      model_type: llm
+    vector_dbs: []
+    datasets: []
+    scoring_fns: []
+    benchmarks: []
+    tool_groups:
+    - toolgroup_id: builtin::websearch
+      provider_id: tavily-search
+    - toolgroup_id: builtin::rag
+      provider_id: rag-runtime
+    server:
+      port: 8321
+kind: ConfigMap
+metadata:
+  creationTimestamp: null
+  name: llama-stack-config
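To confirm the run config embedded in this ConfigMap after it is applied (for example, that CODE_MODEL defaulted as expected), it can be dumped back out; a sketch, with the dotted key escaped for kubectl's jsonpath:

kubectl get configmap llama-stack-config \
  -o jsonpath='{.data.stack_run_config\.yaml}' | head -n 40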

@@ -28,27 +28,34 @@ spec:
      initContainers:
        - name: wait-for-vllm-server
          image: busybox:1.28
-          command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8000; do echo waiting for vllm-server on port 8000; sleep 2; done;']
-        - name: wait-for-vllm-server-safety
+          command: ['sh', '-c', 'until nc -z vllm-server.default.svc.cluster.local 8001; do echo waiting for vllm-server on port 8001; sleep 2; done;']
+        - name: wait-for-llm-nim-code
          image: busybox:1.28
-          command: ['sh', '-c', 'until nc -z vllm-server-safety.default.svc.cluster.local 8001; do echo waiting for vllm-server-safety on port 8001; sleep 2; done;']
+          command: ['sh', '-c', 'until nc -z llm-nim-code.default.svc.cluster.local 8000; do echo waiting for llm-nim-code on port 8000; sleep 2; done;']
      containers:
        - name: llama-stack
          image: llamastack/distribution-starter:latest
          imagePullPolicy: Always # since we have specified latest instead of a version
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "500m"
+              ephemeral-storage: "2Gi"
+            limits:
+              memory: "1Gi"
+              cpu: "1000m"
+              ephemeral-storage: "5Gi"
          env:
            - name: ENABLE_CHROMADB
              value: "true"
            - name: CHROMADB_URL
              value: http://chromadb.default.svc.cluster.local:6000
            - name: VLLM_URL
-              value: http://vllm-server.default.svc.cluster.local:8000/v1
+              value: http://vllm-server.default.svc.cluster.local:8001/v1
            - name: VLLM_MAX_TOKENS
              value: "3072"
            - name: NVIDIA_BASE_URL
-              value: http://llama-nano-nim.default.svc.cluster.local:8000/v1
-            - name: VLLM_SAFETY_URL
-              value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
+              value: http://llm-nim-code.default.svc.cluster.local:8000/v1
            - name: POSTGRES_HOST
              value: postgres-server.default.svc.cluster.local
            - name: POSTGRES_PORT
@@ -57,8 +64,8 @@ spec:
              value: "false"
            - name: INFERENCE_MODEL
              value: "${INFERENCE_MODEL}"
-            - name: SAFETY_MODEL
-              value: "${SAFETY_MODEL}"
+            - name: CODE_MODEL
+              value: "${CODE_MODEL}"
            - name: TAVILY_SEARCH_API_KEY
              value: "${TAVILY_SEARCH_API_KEY}"
          command: ["python", "-m", "llama_stack.distribution.server.server", "--config", "/etc/config/stack_run_config.yaml", "--port", "8321"]

@@ -16,13 +16,12 @@ providers:
        max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
        api_token: ${env.VLLM_API_TOKEN:=fake}
        tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-    - provider_id: vllm-safety
-      provider_type: remote::vllm
+    - provider_id: nvidia
+      provider_type: remote::nvidia
      config:
-        url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-        max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-        api_token: ${env.VLLM_API_TOKEN:=fake}
-        tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+        url: ${env.NVIDIA_BASE_URL:=http://localhost:8001/v1}
+        api_key: ${env.NVIDIA_API_KEY:=}
+        append_api_version: ${env.NVIDIA_APPEND_API_VERSION:=True}
    - provider_id: sentence-transformers
      provider_type: inline::sentence-transformers
      config: {}
@@ -103,11 +102,9 @@ models:
  provider_id: vllm-inference
  model_type: llm
- metadata: {}
-  model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
-  provider_id: vllm-safety
+  model_id: ${env.CODE_MODEL:=bigcode/starcoder2-7b}
+  provider_id: nvidia
  model_type: llm
-shields:
-- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
vector_dbs: []
datasets: []
scoring_fns: []

@@ -34,7 +34,7 @@ spec:
          image: vllm/vllm-openai:latest
          command: ["/bin/sh", "-c"]
          args:
-            - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4"
+            - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4 --port 8001"
          env:
            - name: INFERENCE_MODEL
              value: "${INFERENCE_MODEL}"
@@ -44,7 +44,7 @@ spec:
                  name: hf-token-secret
                  key: token
          ports:
-            - containerPort: 8000
+            - containerPort: 8001
          resources:
            limits:
              nvidia.com/gpu: 1
@@ -67,6 +67,6 @@ spec:
    app.kubernetes.io/name: vllm
  ports:
    - protocol: TCP
-      port: 8000
-      targetPort: 8000
+      port: 8001
+      targetPort: 8001
  type: ClusterIP
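With vLLM moved to port 8001, a hedged smoke test against the service (the service name vllm-server is the one the stack deployment waits on elsewhere in this commit):

kubectl port-forward svc/vllm-server 8001:8001 &
curl -s http://localhost:8001/v1/models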

@@ -273,6 +273,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper, Models
         response_format: ResponseFormat | None = None,
         stream: bool | None = False,
         logprobs: LogProbConfig | None = None,
+        suffix: str | None = None,
     ) -> CompletionResponse | AsyncIterator[CompletionResponseStreamChunk]:
         if sampling_params is None:
             sampling_params = SamplingParams()
@@ -293,6 +294,7 @@ class NVIDIAInferenceAdapter(OpenAIMixin, Inference, ModelRegistryHelper, Models
                 response_format=response_format,
                 stream=stream,
                 logprobs=logprobs,
+                suffix=suffix,
             ),
             n=1,
         )

@@ -155,7 +155,8 @@ def convert_completion_request(
     if request.logprobs:
         payload.update(logprobs=request.logprobs.top_k)
+    if request.suffix:
+        payload.update(suffix=request.suffix)

     if request.sampling_params:
         nvext.update(repetition_penalty=request.sampling_params.repetition_penalty)