fix: update k8s templates (#2645)

# What does this PR do?
- fix env variable defaults in the run config templates
- use GPU nodes and GPU resource requests for vLLM
- add eks/apply.sh for AWS EKS
- add a template to set the HF token secret

## Test Plan
bash apply.sh
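
For reference, a slightly fuller test sequence, assuming the EKS path and the service name / client invocation from the deployment guide further down:

```
cd docs/source/distributions/eks
./apply.sh
# Wait for the pods to reach Running, then smoke-test inference.
kubectl get pods
kubectl port-forward service/llama-stack-service 5000:5000
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
```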

Co-authored-by: Eric Huang <erichuang@fb.com>
ehhuang 2025-07-08 15:57:01 -07:00 committed by GitHub
parent daf660c4ea
commit 84fa83b788
9 changed files with 100 additions and 32 deletions

View file

@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -euo pipefail
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
K8S_DIR="${SCRIPT_DIR}/../k8s"
echo "Setting up AWS EKS-specific storage class..."
kubectl apply -f gp3-topology-aware.yaml
echo "Running main Kubernetes deployment..."
cd "${K8S_DIR}"
./apply.sh "$@"

View file

@@ -0,0 +1,15 @@
# Set up default storage class on AWS EKS
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: gp3-topology-aware
  annotations:
    storageclass.kubernetes.io/is-default-class: "true"
parameters:
  type: gp3
  iops: "3000"
  throughput: "125"
provisioner: ebs.csi.aws.com
reclaimPolicy: Delete
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true
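
A quick sanity check once the manifest above is applied (assuming kubectl is pointed at the EKS cluster):

```
kubectl apply -f gp3-topology-aware.yaml
# gp3-topology-aware should be listed and annotated as the default class.
kubectl get storageclass
```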

View file

@@ -13,9 +13,22 @@ export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack}
export INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
# HF_TOKEN should be set by the user; base64 encode it for the secret
if [ -n "${HF_TOKEN:-}" ]; then
  export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
else
  echo "ERROR: HF_TOKEN not set. You need it for vLLM to download models from Hugging Face."
  exit 1
fi
set -euo pipefail
set -x
# Apply the HF token secret if HF_TOKEN is provided
if [ -n "${HF_TOKEN:-}" ]; then
  envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
fi
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -

View file

@@ -0,0 +1,7 @@
apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
data:
  token: ${HF_TOKEN_BASE64}
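
For reference, a minimal round trip through this template, mirroring the base64 encoding done in apply.sh (the token value below is a placeholder):

```
export HF_TOKEN=hf_xxxxxxxxxxxxxxxx   # placeholder, not a real token
export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
envsubst < hf-token-secret.yaml.template | kubectl apply -f -
# The stored value should decode back to the original token.
kubectl get secret hf-token-secret -o jsonpath='{.data.token}' | base64 -d
```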

View file

@@ -22,10 +22,10 @@ data:
- provider_id: vllm-safety
provider_type: remote::vllm
config:
url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:4096}
api_token: ${env.VLLM_API_TOKEN:fake}
tls_verify: ${env.VLLM_TLS_VERIFY:true}
url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
api_token: ${env.VLLM_API_TOKEN:=fake}
tls_verify: ${env.VLLM_TLS_VERIFY:=true}
- provider_id: sentence-transformers
provider_type: inline::sentence-transformers
config: {}
@@ -33,7 +33,7 @@ data:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:+}
url: ${env.CHROMADB_URL:=}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
@@ -48,7 +48,7 @@ data:
host: ${env.POSTGRES_HOST:=localhost}
port: ${env.POSTGRES_PORT:=5432}
db: ${env.POSTGRES_DB:=llamastack}
user: ${env.POSTGRES_USER:llamastack}
user: ${env.POSTGRES_USER:=llamastack}
password: ${env.POSTGRES_PASSWORD:=llamastack}
responses_store:
type: postgres
@@ -61,8 +61,8 @@ data:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: ${env.OTEL_SERVICE_NAME:+}
sinks: ${env.TELEMETRY_SINKS:console}
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search
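
The `${env.VAR:=default}` and `${env.VAR:+value}` forms used in these templates appear to follow bash parameter-expansion conventions (default if unset vs. substitute only if set); a quick shell illustration of that analogy:

```
unset VLLM_MAX_TOKENS
echo "${VLLM_MAX_TOKENS:=4096}"      # prints 4096: the default is used because the variable is unset
export ENABLE_CHROMADB=1
echo "${ENABLE_CHROMADB:+chromadb}"  # prints chromadb: substituted only because the variable is set
```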

View file

@@ -30,7 +30,7 @@ providers:
- provider_id: ${env.ENABLE_CHROMADB:+chromadb}
provider_type: remote::chromadb
config:
url: ${env.CHROMADB_URL:+}
url: ${env.CHROMADB_URL:=}
safety:
- provider_id: llama-guard
provider_type: inline::llama-guard
@@ -58,8 +58,8 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
service_name: ${env.OTEL_SERVICE_NAME:+console}
sinks: ${env.TELEMETRY_SINKS:+console}
service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
sinks: ${env.TELEMETRY_SINKS:=console}
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search

View file

@@ -25,16 +25,8 @@ spec:
app.kubernetes.io/name: vllm
workload-type: inference
spec:
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: workload-type
operator: In
values:
- inference
topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
nodeSelector:
eks.amazonaws.com/nodegroup: gpu
containers:
- name: vllm
image: vllm/vllm-openai:latest
@@ -42,6 +34,8 @@ spec:
args:
- "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
env:
- name: INFERENCE_MODEL
value: "${INFERENCE_MODEL}"
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
@@ -49,6 +43,11 @@ spec:
key: token
ports:
- containerPort: 8000
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
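
One way to check that the new nodeSelector and GPU resources take effect (assuming the labels shown above and the default namespace):

```
# The GPU node group and the pod placement on it.
kubectl get nodes -l eks.amazonaws.com/nodegroup=gpu
kubectl get pods -l app.kubernetes.io/name=vllm -o wide
# The GPU request/limit as rendered into the pod spec.
kubectl get pods -l app.kubernetes.io/name=vllm -o jsonpath='{.items[*].spec.containers[*].resources}'
```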

View file

@@ -6,7 +6,6 @@ spec:
accessModes:
- ReadWriteOnce
volumeMode: Filesystem
storageClassName: gp2
resources:
requests:
storage: 30Gi
@@ -26,16 +25,8 @@ spec:
app.kubernetes.io/name: vllm-safety
workload-type: inference
spec:
affinity:
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: workload-type
operator: In
values:
- inference
topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
nodeSelector:
eks.amazonaws.com/nodegroup: gpu
containers:
- name: vllm-safety
image: vllm/vllm-openai:latest
@@ -44,6 +35,8 @@ spec:
"vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
]
env:
- name: SAFETY_MODEL
value: "${SAFETY_MODEL}"
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
@@ -51,6 +44,11 @@ spec:
key: token
ports:
- containerPort: 8001
resources:
limits:
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 1
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface

View file

@@ -5,6 +5,8 @@ Instead of starting the Llama Stack and vLLM servers locally. We can deploy them
### Prerequisites
In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
Note: You can also deploy the Llama Stack server in an AWS EKS cluster. See [Deploying Llama Stack Server in AWS EKS](#deploying-llama-stack-server-in-aws-eks) for more details.
First, create a local Kubernetes cluster via Kind:
```
@@ -217,3 +219,18 @@ Finally, we forward the Kubernetes service to a local port and test some inferen
kubectl port-forward service/llama-stack-service 5000:5000
llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
```
## Deploying Llama Stack Server in AWS EKS
We've also provided a script to deploy the Llama Stack server in an AWS EKS cluster. Once you have an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html), you can run the following script to deploy it.
```
cd docs/source/distributions/eks
./apply.sh
```
This script will:
- Set up a default storage class for AWS EKS
- Deploy the Llama Stack server in a Kubernetes Pod and Service
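
A rough sketch of what to expect afterwards (assuming kubectl is pointed at the EKS cluster): the storage class becomes the cluster default, and the llama-storage PVCs bind once the vLLM pods are scheduled because of `WaitForFirstConsumer`.

```
kubectl get storageclass   # gp3-topology-aware marked as default
kubectl get pvc            # llama-storage volumes should reach Bound
kubectl get pods -o wide   # vLLM pods placed on the gpu node group
```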