add NIM k8s solution

2025-08-01 00:05:18 +00:00 · 2025-07-29 09:01:21 -07:00 · 2025-07-29 09:01:21 -07:00 · 8c0f328cbc
commit 8c0f328cbc
parent 95d25ddfe2
4 changed files with 133 additions and 128 deletions
--- a/docs/source/distributions/k8s/delete.sh
+++ b/docs/source/distributions/k8s/delete.sh
@ -0,0 +1,58 @@
 #!/usr/bin/env bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 # Tears down the LlamaStack Kubernetes demo resources by rendering each
 # *.yaml.template in the current directory with envsubst and piping the result
 # to `kubectl delete`. Run it from the directory that contains the templates.
 #
 # Environment:
 #   USE_EBS   optional, defaults to "false"; exported for the templates.
 #   HF_TOKEN  optional; when set, the HF token secret is deleted as well.

 # Enable strict mode before doing any work so a failing step aborts the script.
 set -euo pipefail

 # Values substituted into the templates by envsubst.
 export POSTGRES_USER=llamastack
 export POSTGRES_DB=llamastack
 export POSTGRES_PASSWORD=llamastack
 export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
 export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B

 # Set USE_EBS to false if you don't have permission to use EKS EBS
 export USE_EBS="${USE_EBS:-false}"

 # HF_TOKEN should be set by the user; base64 encode it for the secret.
 # This runs BEFORE `set -x`, and only a non-secret flag is consulted later, so
 # the token value never appears in the xtrace output.
 delete_hf_secret=false
 if [ -n "${HF_TOKEN:-}" ]; then
   # Assign first, export second: `export VAR=$(cmd)` would hide a failure of cmd.
   # `tr -d '\n'` removes the line wrapping some base64 implementations add.
   HF_TOKEN_BASE64=$(printf '%s' "$HF_TOKEN" | base64 | tr -d '\n')
   export HF_TOKEN_BASE64
   delete_hf_secret=true
 fi

 #######################################
 # Render a template with envsubst and delete the resources it describes.
 # Arguments: $1 - path to the *.yaml.template file
 # Returns:   non-zero if envsubst or kubectl fails (pipefail is enabled)
 #######################################
 delete_from_template() {
   envsubst < "$1" | kubectl delete -f - --ignore-not-found=true
 }

 set -x

 # Delete resources in reverse order of creation to handle dependencies properly

 # Delete UI deployment
 delete_from_template ./ui-k8s.yaml.template

 # Delete ingress
 delete_from_template ./ingress-k8s.yaml.template

 # Delete stack deployment
 delete_from_template ./stack-k8s.yaml.template

 # Delete configmap
 kubectl delete configmap llama-stack-config --ignore-not-found=true

 # Delete chroma deployment
 delete_from_template ./chroma-k8s.yaml.template

 # Delete postgres deployment
 delete_from_template ./postgres-k8s.yaml.template

 # Delete vllm-safety deployment
 delete_from_template ./vllm-safety-k8s.yaml.template

 # Delete vllm deployment
 delete_from_template ./vllm-k8s.yaml.template

 # Delete the NIM deployment/service when its template is present. This is a
 # no-op if those resources were never created (--ignore-not-found).
 if [ -f ./llama-nim.yaml.template ]; then
   delete_from_template ./llama-nim.yaml.template
 fi

 # Delete the HF token secret if it exists
 if [ "$delete_hf_secret" = true ]; then
   delete_from_template ./hf-token-secret.yaml.template
 fi

 echo "All LlamaStack Kubernetes resources have been deleted."
--- a/docs/source/distributions/k8s/llama-nim.yaml.template
+++ b/docs/source/distributions/k8s/llama-nim.yaml.template
@ -0,0 +1,73 @@
 # -------------------------------------------------
 # NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
 # -------------------------------------------------
 # Deployment running one replica of the NIM container on a single GPU.
 # Prerequisites referenced below (not created by this manifest):
 #   - image pull secret `ngc-secret`
 #   - secret `ngc-api` with key `NGC_API_KEY`
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: llama-nano-nim
  labels:
    app: llama-nano-nim
 spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-nano-nim
  template:
    metadata:
      labels:
        app: llama-nano-nim
    spec:
      imagePullSecrets:
        - name: ngc-secret          # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
      volumes:
        - name: model-cache
          emptyDir:
            medium: Memory          # tmpfs; omit or use "" to back by node disk
            sizeLimit: 12Gi          # fits the 4 B model + tensors; adjust if needed
      containers:
        - name: nim
          image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
          ports:
            - name: http-openai
              containerPort: 8000
          resources:
            limits:
              nvidia.com/gpu: 1
          env:
            - name: NIM_MODEL_NAME
              value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
            # Point NIM's model cache at the mounted volume. Without this NIM
            # uses its own default cache directory and the model-cache volume
            # mounted at /models would sit unused.
            - name: NIM_CACHE_PATH
              value: "/models"
            - name: NGC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ngc-api
                  key: NGC_API_KEY
          volumeMounts:
            - name: model-cache
              mountPath: /models       # must match NIM_CACHE_PATH above
          # Give the container up to 30 minutes (180 x 10s) to download and
          # load the model; liveness/readiness checks only begin once this
          # probe has succeeded, so a slow first start is not killed.
          startupProbe:
            httpGet:
              path: /v1/health/ready
              port: http-openai
            initialDelaySeconds: 40
            periodSeconds: 10
            failureThreshold: 180
          readinessProbe:
            httpGet:
              path: /v1/health/ready
              port: http-openai
            initialDelaySeconds: 20
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /v1/health/live    # NIM liveness endpoint
              port: http-openai
            initialDelaySeconds: 60
            periodSeconds: 30
 ---
 # In-cluster Service that routes port 8000 to pods labelled app=llama-nano-nim.
 apiVersion: v1
 kind: Service
 metadata:
  name: llama-nano-nim
 spec:
  # ClusterIP: reachable only from inside the cluster.
  type: ClusterIP
  # Must match the pod labels set by the llama-nano-nim Deployment template.
  selector:
    app: llama-nano-nim
  ports:
    - name: http-openai
      targetPort: 8000
      port: 8000
--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@ -1,128 +0,0 @@
 # ConfigMap `llama-stack-config`: carries the Llama Stack run configuration as
 # a single literal string under the `stack_run_config.yaml` key.
 #
 # The embedded configuration declares:
 #   - two remote::vllm inference providers (vllm-inference, vllm-safety) and
 #     an inline sentence-transformers provider for the embedding model
 #   - a ChromaDB vector_io provider, a llama-guard safety provider, and a
 #     meta-reference agents provider backed by Postgres stores
 #   - Postgres-backed metadata_store and inference_store
 #   - the server port (8321)
 #
 # NOTE(review): `${env.NAME:=default}` / `${env.NAME:+value}` placeholders are
 # presumably resolved by Llama Stack when it loads the file, not by Kubernetes
 # — confirm against the Llama Stack configuration docs.
 # NOTE(review): `creationTimestamp: null` suggests this manifest was generated
 # (e.g. with a kubectl dry-run) rather than hand-written — confirm.
 #
 # Do not add comments inside the block scalar below: every line there is part
 # of the ConfigMap data.
 apiVersion: v1
 data:
  stack_run_config.yaml: |
    version: '2'
    image_name: kubernetes-demo
    apis:
    - agents
    - inference
    - safety
    - telemetry
    - tool_runtime
    - vector_io
    providers:
      inference:
      - provider_id: vllm-inference
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: vllm-safety
        provider_type: remote::vllm
        config:
          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
          api_token: ${env.VLLM_API_TOKEN:=fake}
          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
      - provider_id: sentence-transformers
        provider_type: inline::sentence-transformers
        config: {}
      vector_io:
      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
        provider_type: remote::chromadb
        config:
          url: ${env.CHROMADB_URL:=}
      safety:
      - provider_id: llama-guard
        provider_type: inline::llama-guard
        config:
          excluded_categories: []
      agents:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          persistence_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
          responses_store:
            type: postgres
            host: ${env.POSTGRES_HOST:=localhost}
            port: ${env.POSTGRES_PORT:=5432}
            db: ${env.POSTGRES_DB:=llamastack}
            user: ${env.POSTGRES_USER:=llamastack}
            password: ${env.POSTGRES_PASSWORD:=llamastack}
      telemetry:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
          sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
        config:
          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: tavily-search
        provider_type: remote::tavily-search
        config:
          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
          max_results: 3
      - provider_id: rag-runtime
        provider_type: inline::rag-runtime
        config: {}
      - provider_id: model-context-protocol
        provider_type: remote::model-context-protocol
        config: {}
    metadata_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
      table_name: llamastack_kvstore
    inference_store:
      type: postgres
      host: ${env.POSTGRES_HOST:=localhost}
      port: ${env.POSTGRES_PORT:=5432}
      db: ${env.POSTGRES_DB:=llamastack}
      user: ${env.POSTGRES_USER:=llamastack}
      password: ${env.POSTGRES_PASSWORD:=llamastack}
    models:
    - metadata:
        embedding_dimension: 384
      model_id: all-MiniLM-L6-v2
      provider_id: sentence-transformers
      model_type: embedding
    - metadata: {}
      model_id: ${env.INFERENCE_MODEL}
      provider_id: vllm-inference
      model_type: llm
    - metadata: {}
      model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
      provider_id: vllm-safety
      model_type: llm
    shields:
    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
    vector_dbs: []
    datasets: []
    scoring_fns: []
    benchmarks: []
    tool_groups:
    - toolgroup_id: builtin::websearch
      provider_id: tavily-search
    - toolgroup_id: builtin::rag
      provider_id: rag-runtime
    server:
      port: 8321
 kind: ConfigMap
 metadata:
  creationTimestamp: null
  name: llama-stack-config
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@ -45,6 +45,8 @@ spec:
          value: http://vllm-server.default.svc.cluster.local:8000/v1
        - name: VLLM_MAX_TOKENS
          value: "3072"
        - name: NVIDIA_BASE_URL
          value: http://llama-nano-nim.default.svc.cluster.local:8000/v1
        - name: VLLM_SAFETY_URL
          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
        - name: POSTGRES_HOST