diff --git a/docs/source/distributions/k8s/delete.sh b/docs/source/distributions/k8s/delete.sh
new file mode 100644
index 000000000..c095212e5
--- /dev/null
+++ b/docs/source/distributions/k8s/delete.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+export POSTGRES_USER=llamastack
+export POSTGRES_DB=llamastack
+export POSTGRES_PASSWORD=llamastack
+
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
+# Set USE_EBS to false if you don't have permission to use EKS EBS
+export USE_EBS=${USE_EBS:-false}
+
+# HF_TOKEN should be set by the user; base64 encode it for the secret
+if [ -n "${HF_TOKEN:-}" ]; then
+  export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
+fi
+
+set -euo pipefail
+set -x
+
+# Delete resources in reverse order of creation to handle dependencies properly
+
+# Delete UI deployment
+envsubst < ./ui-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+
+# Delete ingress
+envsubst < ./ingress-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+
+# Delete stack deployment
+envsubst < ./stack-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+
+# Delete configmap
+kubectl delete configmap llama-stack-config --ignore-not-found=true
+
+# Delete chroma deployment
+envsubst < ./chroma-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+
+# Delete postgres deployment
+envsubst < ./postgres-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+
+# Delete vllm-safety deployment
+envsubst < ./vllm-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+
+# Delete vllm deployment
+envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+
+# Delete the HF token secret if it exists
+if [ -n "${HF_TOKEN:-}" ]; then
+  envsubst < ./hf-token-secret.yaml.template | kubectl delete -f - --ignore-not-found=true
+fi
+
+echo "All LlamaStack Kubernetes resources have been deleted."
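The teardown script reads everything it needs from the environment, so running it is a single command. A minimal sketch, assuming it is run from `docs/source/distributions/k8s/` so the relative `*.yaml.template` paths resolve; the `HF_TOKEN` value is a placeholder and only matters if the deployment created `hf-token-secret`:

```bash
cd docs/source/distributions/k8s

# Only needed if the deployment created the HF token secret; placeholder value.
export HF_TOKEN=hf_xxxxxxxx
# Match whatever was used at deploy time; the script defaults this to false.
export USE_EBS=false

bash ./delete.sh
```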
diff --git a/docs/source/distributions/k8s/llama-nim.yaml.template b/docs/source/distributions/k8s/llama-nim.yaml.template
new file mode 100644
index 000000000..775e85629
--- /dev/null
+++ b/docs/source/distributions/k8s/llama-nim.yaml.template
@@ -0,0 +1,73 @@
+# -------------------------------------------------
+# NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
+# -------------------------------------------------
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-nano-nim
+  labels:
+    app: llama-nano-nim
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: llama-nano-nim
+  template:
+    metadata:
+      labels:
+        app: llama-nano-nim
+    spec:
+      imagePullSecrets:
+        - name: ngc-secret          # docker-registry secret: nvcr.io / $oauthtoken /
+      volumes:
+        - name: model-cache
+          emptyDir:
+            medium: Memory          # tmpfs; omit or use "" to back by node disk
+            sizeLimit: 12Gi         # fits the 4B model + tensors; adjust if needed
+      containers:
+        - name: nim
+          image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
+          ports:
+            - name: http-openai
+              containerPort: 8000
+          resources:
+            limits:
+              nvidia.com/gpu: 1
+          env:
+            - name: NIM_MODEL_NAME
+              value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
+            - name: NGC_API_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: ngc-api
+                  key: NGC_API_KEY
+          volumeMounts:
+            - name: model-cache
+              mountPath: /models    # default NIM cache path
+          readinessProbe:
+            httpGet:
+              path: /v1/models
+              port: http-openai
+            initialDelaySeconds: 20
+            periodSeconds: 10
+          livenessProbe:
+            httpGet:
+              path: /v1/health
+              port: http-openai
+            initialDelaySeconds: 60
+            periodSeconds: 30
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama-nano-nim
+spec:
+  selector:
+    app: llama-nano-nim
+  ports:
+    - name: http-openai
+      port: 8000
+      targetPort: 8000
+  type: ClusterIP
diff --git a/docs/source/distributions/k8s/stack-configmap.yaml b/docs/source/distributions/k8s/stack-configmap.yaml
index 129471862..e69de29bb 100644
--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@@ -1,128 +0,0 @@
-apiVersion: v1
-data:
-  stack_run_config.yaml: |
-    version: '2'
-    image_name: kubernetes-demo
-    apis:
-    - agents
-    - inference
-    - safety
-    - telemetry
-    - tool_runtime
-    - vector_io
-    providers:
-      inference:
-      - provider_id: vllm-inference
-        provider_type: remote::vllm
-        config:
-          url: ${env.VLLM_URL:=http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-          api_token: ${env.VLLM_API_TOKEN:=fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-      - provider_id: vllm-safety
-        provider_type: remote::vllm
-        config:
-          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
-          api_token: ${env.VLLM_API_TOKEN:=fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
-      - provider_id: sentence-transformers
-        provider_type: inline::sentence-transformers
-        config: {}
-      vector_io:
-      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
-        provider_type: remote::chromadb
-        config:
-          url: ${env.CHROMADB_URL:=}
-      safety:
-      - provider_id: llama-guard
-        provider_type: inline::llama-guard
-        config:
-          excluded_categories: []
-      agents:
-      - provider_id: meta-reference
-        provider_type: inline::meta-reference
-        config:
-          persistence_store:
-            type: postgres
-            host: ${env.POSTGRES_HOST:=localhost}
-            port: ${env.POSTGRES_PORT:=5432}
-            db: ${env.POSTGRES_DB:=llamastack}
-            user: ${env.POSTGRES_USER:=llamastack}
-            password: ${env.POSTGRES_PASSWORD:=llamastack}
-          responses_store:
-            type: postgres
-            host: ${env.POSTGRES_HOST:=localhost}
-            port: ${env.POSTGRES_PORT:=5432}
-            db: ${env.POSTGRES_DB:=llamastack}
-            user: ${env.POSTGRES_USER:=llamastack}
-            password: ${env.POSTGRES_PASSWORD:=llamastack}
-      telemetry:
-      - provider_id: meta-reference
-        provider_type: inline::meta-reference
-        config:
-          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-          sinks: ${env.TELEMETRY_SINKS:=console}
-      tool_runtime:
-      - provider_id: brave-search
-        provider_type: remote::brave-search
-        config:
-          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
-          max_results: 3
-      - provider_id: tavily-search
-        provider_type: remote::tavily-search
-        config:
-          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
-          max_results: 3
-      - provider_id: rag-runtime
-        provider_type: inline::rag-runtime
-        config: {}
-      - provider_id: model-context-protocol
-        provider_type: remote::model-context-protocol
-        config: {}
-    metadata_store:
-      type: postgres
-      host: ${env.POSTGRES_HOST:=localhost}
-      port: ${env.POSTGRES_PORT:=5432}
-      db: ${env.POSTGRES_DB:=llamastack}
-      user: ${env.POSTGRES_USER:=llamastack}
-      password: ${env.POSTGRES_PASSWORD:=llamastack}
-      table_name: llamastack_kvstore
-    inference_store:
-      type: postgres
-      host: ${env.POSTGRES_HOST:=localhost}
-      port: ${env.POSTGRES_PORT:=5432}
-      db: ${env.POSTGRES_DB:=llamastack}
-      user: ${env.POSTGRES_USER:=llamastack}
-      password: ${env.POSTGRES_PASSWORD:=llamastack}
-    models:
-    - metadata:
-        embedding_dimension: 384
-      model_id: all-MiniLM-L6-v2
-      provider_id: sentence-transformers
-      model_type: embedding
-    - metadata: {}
-      model_id: ${env.INFERENCE_MODEL}
-      provider_id: vllm-inference
-      model_type: llm
-    - metadata: {}
-      model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
-      provider_id: vllm-safety
-      model_type: llm
-    shields:
-    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
-    vector_dbs: []
-    datasets: []
-    scoring_fns: []
-    benchmarks: []
-    tool_groups:
-    - toolgroup_id: builtin::websearch
-      provider_id: tavily-search
-    - toolgroup_id: builtin::rag
-      provider_id: rag-runtime
-    server:
-      port: 8321
-kind: ConfigMap
-metadata:
-  creationTimestamp: null
-  name: llama-stack-config
diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
index 5cfd00425..0d6aba6f5 100644
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -45,6 +45,8 @@ spec:
           value: http://vllm-server.default.svc.cluster.local:8000/v1
         - name: VLLM_MAX_TOKENS
           value: "3072"
+        - name: NVIDIA_BASE_URL
+          value: http://llama-nano-nim.default.svc.cluster.local:8000/v1
         - name: VLLM_SAFETY_URL
           value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
         - name: POSTGRES_HOST
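A quick smoke test for the new NIM pieces, sketched below under the assumption that the `ngc-secret` and `ngc-api` secrets already exist in the `default` namespace and that the manifests are applied from `docs/source/distributions/k8s/`; the throwaway `nim-smoke-test` pod name is illustrative only:

```bash
# The NIM template contains no ${env...} placeholders, so it can be applied directly.
kubectl apply -f ./llama-nim.yaml.template

# Wait for the pod to pass its readiness probe (the first model download can take a while).
kubectl rollout status deployment/llama-nano-nim --timeout=15m

# Query the OpenAI-compatible /v1/models endpoint through the ClusterIP Service --
# the same URL the stack deployment now receives via NVIDIA_BASE_URL.
kubectl run nim-smoke-test --rm -it --restart=Never --image=curlimages/curl -- \
  curl -s http://llama-nano-nim.default.svc.cluster.local:8000/v1/models
```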