Mirror of https://github.com/meta-llama/llama-stack.git · synced 2025-07-14 09:06:10 +00:00
fix: update k8s templates (#2645)
Some checks failed
Integration Auth Tests / test-matrix (oauth2_token) (push) Failing after 5s
Integration Tests / test-matrix (library, 3.12, datasets) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.12, inspect) (push) Failing after 9s
SqlStore Integration Tests / test-postgres (3.12) (push) Failing after 18s
Integration Tests / test-matrix (library, 3.12, scoring) (push) Failing after 12s
SqlStore Integration Tests / test-postgres (3.13) (push) Failing after 16s
Integration Tests / test-matrix (library, 3.13, agents) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.12, inference) (push) Failing after 16s
Integration Tests / test-matrix (library, 3.12, post_training) (push) Failing after 12s
Integration Tests / test-matrix (library, 3.12, agents) (push) Failing after 14s
Integration Tests / test-matrix (library, 3.12, vector_io) (push) Failing after 22s
Integration Tests / test-matrix (library, 3.12, providers) (push) Failing after 13s
Integration Tests / test-matrix (library, 3.12, tool_runtime) (push) Failing after 12s
Integration Tests / test-matrix (library, 3.13, datasets) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.13, scoring) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.13, inference) (push) Failing after 11s
Integration Tests / test-matrix (library, 3.13, post_training) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.13, inspect) (push) Failing after 9s
Integration Tests / test-matrix (server, 3.12, inspect) (push) Failing after 10s
Integration Tests / test-matrix (server, 3.12, agents) (push) Failing after 14s
Integration Tests / test-matrix (server, 3.12, providers) (push) Failing after 10s
Integration Tests / test-matrix (library, 3.13, providers) (push) Failing after 7s
Integration Tests / test-matrix (library, 3.13, tool_runtime) (push) Failing after 9s
Integration Tests / test-matrix (library, 3.13, vector_io) (push) Failing after 11s
Integration Tests / test-matrix (server, 3.12, inference) (push) Failing after 13s
Integration Tests / test-matrix (server, 3.12, tool_runtime) (push) Failing after 10s
Integration Tests / test-matrix (server, 3.12, datasets) (push) Failing after 9s
Integration Tests / test-matrix (server, 3.12, vector_io) (push) Failing after 12s
Integration Tests / test-matrix (server, 3.12, post_training) (push) Failing after 12s
Integration Tests / test-matrix (server, 3.13, inspect) (push) Failing after 15s
Integration Tests / test-matrix (server, 3.12, scoring) (push) Failing after 13s
Integration Tests / test-matrix (server, 3.13, datasets) (push) Failing after 17s
Integration Tests / test-matrix (server, 3.13, providers) (push) Failing after 11s
Integration Tests / test-matrix (server, 3.13, agents) (push) Failing after 12s
Integration Tests / test-matrix (server, 3.13, inference) (push) Failing after 14s
Integration Tests / test-matrix (server, 3.13, post_training) (push) Failing after 10s
Integration Tests / test-matrix (server, 3.13, tool_runtime) (push) Failing after 13s
Integration Tests / test-matrix (server, 3.13, scoring) (push) Failing after 15s
Integration Tests / test-matrix (server, 3.13, vector_io) (push) Failing after 11s
Vector IO Integration Tests / test-matrix (3.12, inline::faiss) (push) Failing after 12s
Vector IO Integration Tests / test-matrix (3.12, inline::milvus) (push) Failing after 13s
Vector IO Integration Tests / test-matrix (3.12, inline::sqlite-vec) (push) Failing after 8s
Vector IO Integration Tests / test-matrix (3.12, remote::pgvector) (push) Failing after 9s
Vector IO Integration Tests / test-matrix (3.12, remote::chromadb) (push) Failing after 11s
Vector IO Integration Tests / test-matrix (3.13, inline::faiss) (push) Failing after 11s
Vector IO Integration Tests / test-matrix (3.13, inline::milvus) (push) Failing after 11s
Vector IO Integration Tests / test-matrix (3.13, inline::sqlite-vec) (push) Failing after 15s
Python Package Build Test / build (3.12) (push) Failing after 33s
Vector IO Integration Tests / test-matrix (3.13, remote::chromadb) (push) Failing after 41s
Vector IO Integration Tests / test-matrix (3.13, remote::pgvector) (push) Failing after 40s
Python Package Build Test / build (3.13) (push) Failing after 33s
Test External Providers / test-external-providers (venv) (push) Failing after 8s
Update ReadTheDocs / update-readthedocs (push) Failing after 10s
Unit Tests / unit-tests (3.12) (push) Failing after 14s
Unit Tests / unit-tests (3.13) (push) Failing after 12s
Pre-commit / pre-commit (push) Successful in 1m23s
# What does this PR do?
- fix env variables
- use gpu for vllm
- add eks/apply.sh for aws
- add template to set hf secret

## Test Plan
bash apply.sh

Co-authored-by: Eric Huang <erichuang@fb.com>
Parent: daf660c4ea
Commit: 84fa83b788
9 changed files with 100 additions and 32 deletions
docs/source/distributions/eks/apply.sh (new executable file, +19)
```bash
#!/usr/bin/env bash

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

set -euo pipefail

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
K8S_DIR="${SCRIPT_DIR}/../k8s"

echo "Setting up AWS EKS-specific storage class..."
kubectl apply -f gp3-topology-aware.yaml

echo "Running main Kubernetes deployment..."
cd "${K8S_DIR}"
./apply.sh "$@"
```
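A usage sketch, under assumptions not stated in the commit: your kubeconfig already points at the EKS cluster, and `HF_TOKEN` holds a valid Hugging Face token, which the k8s `apply.sh` below requires:

```bash
# Hypothetical invocation from the repo root; hf_xxx is a placeholder token
export HF_TOKEN=hf_xxx
cd docs/source/distributions/eks
./apply.sh
```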
docs/source/distributions/eks/gp3-topology-aware.yaml (new file, +15)
```yaml
# Set up default storage class on AWS EKS
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: gp3-topology-aware
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"
parameters:
  type: gp3
  iops: "3000"
  throughput: "125"
provisioner: ebs.csi.aws.com
reclaimPolicy: Delete
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true
```
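Because the class is annotated as the cluster default and binds with `WaitForFirstConsumer`, PVCs that omit `storageClassName` will get topology-aware gp3 volumes provisioned in the pod's availability zone. A quick post-apply check (illustrative, not part of the commit):

```bash
# gp3-topology-aware should be listed and marked "(default)"
kubectl get storageclass
```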
docs/source/distributions/k8s/apply.sh (modified)

```diff
@@ -13,9 +13,22 @@ export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack}
 export INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
 export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
 
+# HF_TOKEN should be set by the user; base64 encode it for the secret
+if [ -n "${HF_TOKEN:-}" ]; then
+  export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
+else
+  echo "ERROR: HF_TOKEN not set. You need it for vLLM to download models from Hugging Face."
+  exit 1
+fi
+
 set -euo pipefail
 set -x
+
+# Apply the HF token secret if HF_TOKEN is provided
+if [ -n "${HF_TOKEN:-}" ]; then
+  envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
+fi
 
 envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
 envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
 envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
```
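Kubernetes `Secret.data` values must be base64-encoded, and the `echo -n` matters: without `-n`, a trailing newline would be encoded into the stored token. A round-trip sanity check with an illustrative placeholder value:

```bash
HF_TOKEN="hf_example"                            # placeholder, not a real token
HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)  # same encoding step as apply.sh
echo "$HF_TOKEN_BASE64" | base64 -d              # prints hf_example
```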
docs/source/distributions/k8s/hf-token-secret.yaml.template (new file, +7)

```yaml
apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
data:
  token: ${HF_TOKEN_BASE64}
```
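Once the k8s `apply.sh` has run `envsubst` over this template and applied it, the stored token can be spot-checked end to end (illustrative command, standard kubectl):

```bash
kubectl get secret hf-token-secret -o jsonpath='{.data.token}' | base64 -d
```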
docs/source/distributions/k8s/stack-configmap.yaml (modified)

```diff
@@ -22,10 +22,10 @@ data:
       - provider_id: vllm-safety
         provider_type: remote::vllm
         config:
-          url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:4096}
-          api_token: ${env.VLLM_API_TOKEN:fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:true}
+          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
       - provider_id: sentence-transformers
         provider_type: inline::sentence-transformers
         config: {}
@@ -33,7 +33,7 @@ data:
       - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
         provider_type: remote::chromadb
         config:
-          url: ${env.CHROMADB_URL:+}
+          url: ${env.CHROMADB_URL:=}
       safety:
       - provider_id: llama-guard
         provider_type: inline::llama-guard
@@ -48,7 +48,7 @@ data:
           host: ${env.POSTGRES_HOST:=localhost}
           port: ${env.POSTGRES_PORT:=5432}
           db: ${env.POSTGRES_DB:=llamastack}
-          user: ${env.POSTGRES_USER:llamastack}
+          user: ${env.POSTGRES_USER:=llamastack}
           password: ${env.POSTGRES_PASSWORD:=llamastack}
         responses_store:
           type: postgres
@@ -61,8 +61,8 @@ data:
       - provider_id: meta-reference
         provider_type: inline::meta-reference
         config:
-          service_name: ${env.OTEL_SERVICE_NAME:+}
-          sinks: ${env.TELEMETRY_SINKS:console}
+          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+          sinks: ${env.TELEMETRY_SINKS:=console}
       tool_runtime:
       - provider_id: brave-search
         provider_type: remote::brave-search
```
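The recurring edit across these hunks is the env-substitution operator: `${env.VAR:=default}` supplies the default when `VAR` is unset, while `${env.VAR:+value}` substitutes `value` only when `VAR` *is* set, mirroring bash parameter expansion; the bare-colon spellings like `${env.VLLM_MAX_TOKENS:4096}` were the broken form being fixed. A bash analogy of the two operators:

```bash
unset MAX_TOKENS
echo "${MAX_TOKENS:=4096}"            # unset -> prints the default: 4096
ENABLE_CHROMADB=1
echo "${ENABLE_CHROMADB:+chromadb}"   # set   -> prints: chromadb
unset ENABLE_CHROMADB
echo "${ENABLE_CHROMADB:+chromadb}"   # unset -> prints an empty line
```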
docs/source/distributions/k8s/stack_run_config.yaml (modified)

```diff
@@ -30,7 +30,7 @@ providers:
   - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
     provider_type: remote::chromadb
     config:
-      url: ${env.CHROMADB_URL:+}
+      url: ${env.CHROMADB_URL:=}
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -58,8 +58,8 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:+console}
-      sinks: ${env.TELEMETRY_SINKS:+console}
+      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+      sinks: ${env.TELEMETRY_SINKS:=console}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
```
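A hedged aside: the new `service_name` default is the literal escape `\u200B`, a zero-width space, presumably chosen because a truly empty default is not accepted here; functionally it amounts to an empty-looking OTEL service name.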
docs/source/distributions/k8s/vllm-k8s.yaml.template (modified)

```diff
@@ -25,16 +25,8 @@ spec:
         app.kubernetes.io/name: vllm
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm
         image: vllm/vllm-openai:latest
@@ -42,6 +34,8 @@ spec:
         args:
         - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
         env:
+        - name: INFERENCE_MODEL
+          value: "${INFERENCE_MODEL}"
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
@@ -49,6 +43,11 @@ spec:
               key: token
         ports:
         - containerPort: 8000
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+          requests:
+            nvidia.com/gpu: 1
         volumeMounts:
         - name: llama-storage
           mountPath: /root/.cache/huggingface
```
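The pod anti-affinity is replaced by a `nodeSelector` pinning inference pods to the EKS GPU nodegroup, and each pod now requests one `nvidia.com/gpu`, which also requires the NVIDIA device plugin on those nodes (typically preinstalled on EKS GPU AMIs). Two quick checks that the scheduler can satisfy this, illustrative and assuming a nodegroup named `gpu`:

```bash
# Nodes carrying the label the nodeSelector matches on
kubectl get nodes -L eks.amazonaws.com/nodegroup
# Allocatable vs requested GPUs on a given node (replace the placeholder name)
kubectl describe node <gpu-node-name> | grep 'nvidia.com/gpu'
```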
docs/source/distributions/k8s/vllm-safety-k8s.yaml.template (modified)

```diff
@@ -6,7 +6,6 @@ spec:
   accessModes:
   - ReadWriteOnce
   volumeMode: Filesystem
-  storageClassName: gp2
   resources:
     requests:
       storage: 30Gi
@@ -26,16 +25,8 @@ spec:
         app.kubernetes.io/name: vllm-safety
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm-safety
         image: vllm/vllm-openai:latest
@@ -44,6 +35,8 @@ spec:
           "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
         ]
         env:
+        - name: SAFETY_MODEL
+          value: "${SAFETY_MODEL}"
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
             secretKeyRef:
@@ -51,6 +44,11 @@ spec:
               key: token
         ports:
         - containerPort: 8001
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+          requests:
+            nvidia.com/gpu: 1
         volumeMounts:
         - name: llama-storage
           mountPath: /root/.cache/huggingface
```
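With `storageClassName: gp2` removed, this PVC falls back to the cluster default StorageClass, i.e. the `gp3-topology-aware` class installed by `eks/apply.sh`; its `WaitForFirstConsumer` binding then provisions the volume in the zone of the GPU node the pod lands on. To see which class a claim actually bound to (illustrative):

```bash
# The STORAGECLASS column shows the class each claim bound to
kubectl get pvc
```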
docs/source/distributions/kubernetes_deployment.md (modified)

````diff
@@ -5,6 +5,8 @@ Instead of starting the Llama Stack and vLLM servers locally. We can deploy them
 ### Prerequisites
 In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
 
+Note: You can also deploy the Llama Stack server in an AWS EKS cluster. See [Deploying Llama Stack Server in AWS EKS](#deploying-llama-stack-server-in-aws-eks) for more details.
+
 First, create a local Kubernetes cluster via Kind:
 
 ```
@@ -217,3 +219,18 @@ Finally, we forward the Kubernetes service to a local port and test some inference
 kubectl port-forward service/llama-stack-service 5000:5000
 llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
 ```
+
+## Deploying Llama Stack Server in AWS EKS
+
+We've also provided a script to deploy the Llama Stack server in an AWS EKS cluster. Once you have an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html), you can run the following script to deploy the Llama Stack server.
+
+
+```
+cd docs/source/distributions/eks
+./apply.sh
+```
+
+This script will:
+
+- Set up a default storage class for AWS EKS
+- Deploy the Llama Stack server in a Kubernetes Pod and Service
````