diff --git a/docs/source/distributions/eks/apply.sh b/docs/source/distributions/eks/apply.sh
new file mode 100755
index 000000000..3ad3dd263
--- /dev/null
+++ b/docs/source/distributions/eks/apply.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+set -euo pipefail
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+K8S_DIR="${SCRIPT_DIR}/../k8s"
+
+echo "Setting up AWS EKS-specific storage class..."
+kubectl apply -f "${SCRIPT_DIR}/gp3-topology-aware.yaml"
+
+echo "Running main Kubernetes deployment..."
+cd "${K8S_DIR}"
+./apply.sh "$@"
diff --git a/docs/source/distributions/eks/gp3-topology-aware.yaml b/docs/source/distributions/eks/gp3-topology-aware.yaml
new file mode 100644
index 000000000..1192ba18c
--- /dev/null
+++ b/docs/source/distributions/eks/gp3-topology-aware.yaml
@@ -0,0 +1,15 @@
+# Set up default storage class on AWS EKS
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: gp3-topology-aware
+  annotations:
+    storageclass.kubernetes.io/is-default-class: "true"
+parameters:
+  type: gp3
+  iops: "3000"
+  throughput: "125"
+provisioner: ebs.csi.aws.com
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+allowVolumeExpansion: true
diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh
index 7ff7d28eb..06b1ea10c 100755
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@@ -13,9 +13,22 @@ export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack}
 export INFERENCE_MODEL=${INFERENCE_MODEL:-meta-llama/Llama-3.2-3B-Instruct}
 export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-1B}
 
+# HF_TOKEN should be set by the user; base64 encode it for the secret
+if [ -n "${HF_TOKEN:-}" ]; then
+  export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
+else
+  echo "ERROR: HF_TOKEN not set. You need it for vLLM to download models from Hugging Face."
+  exit 1
+fi
+
 set -euo pipefail
 set -x
 
+# Apply the HF token secret if HF_TOKEN is provided
+if [ -n "${HF_TOKEN:-}" ]; then
+  envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
+fi
+
 envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
 envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
 envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
diff --git a/docs/source/distributions/k8s/hf-token-secret.yaml.template b/docs/source/distributions/k8s/hf-token-secret.yaml.template
new file mode 100644
index 000000000..b6db8e7bc
--- /dev/null
+++ b/docs/source/distributions/k8s/hf-token-secret.yaml.template
@@ -0,0 +1,7 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token-secret
+type: Opaque
+data:
+  token: ${HF_TOKEN_BASE64}
diff --git a/docs/source/distributions/k8s/stack-configmap.yaml b/docs/source/distributions/k8s/stack-configmap.yaml
index 0a08bca03..129471862 100644
--- a/docs/source/distributions/k8s/stack-configmap.yaml
+++ b/docs/source/distributions/k8s/stack-configmap.yaml
@@ -22,10 +22,10 @@ data:
       - provider_id: vllm-safety
         provider_type: remote::vllm
         config:
-          url: ${env.VLLM_SAFETY_URL:http://localhost:8000/v1}
-          max_tokens: ${env.VLLM_MAX_TOKENS:4096}
-          api_token: ${env.VLLM_API_TOKEN:fake}
-          tls_verify: ${env.VLLM_TLS_VERIFY:true}
+          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
       - provider_id: sentence-transformers
         provider_type: inline::sentence-transformers
         config: {}
@@ -33,7 +33,7 @@ data:
       - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
         provider_type: remote::chromadb
         config:
-          url: ${env.CHROMADB_URL:+}
+          url: ${env.CHROMADB_URL:=}
       safety:
       - provider_id: llama-guard
         provider_type: inline::llama-guard
@@ -48,7 +48,7 @@ data:
         host: ${env.POSTGRES_HOST:=localhost}
         port: ${env.POSTGRES_PORT:=5432}
         db: ${env.POSTGRES_DB:=llamastack}
-        user: ${env.POSTGRES_USER:llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
         password: ${env.POSTGRES_PASSWORD:=llamastack}
       responses_store:
         type: postgres
@@ -61,8 +61,8 @@ data:
      - provider_id: meta-reference
        provider_type: inline::meta-reference
        config:
-         service_name: ${env.OTEL_SERVICE_NAME:+}
-         sinks: ${env.TELEMETRY_SINKS:console}
+         service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+         sinks: ${env.TELEMETRY_SINKS:=console}
      tool_runtime:
      - provider_id: brave-search
        provider_type: remote::brave-search
diff --git a/docs/source/distributions/k8s/stack_run_config.yaml b/docs/source/distributions/k8s/stack_run_config.yaml
index 5ac08134c..23993ca5d 100644
--- a/docs/source/distributions/k8s/stack_run_config.yaml
+++ b/docs/source/distributions/k8s/stack_run_config.yaml
@@ -30,7 +30,7 @@ providers:
   - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
     provider_type: remote::chromadb
     config:
-      url: ${env.CHROMADB_URL:+}
+      url: ${env.CHROMADB_URL:=}
   safety:
   - provider_id: llama-guard
     provider_type: inline::llama-guard
@@ -58,8 +58,8 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: ${env.OTEL_SERVICE_NAME:+console}
-      sinks: ${env.TELEMETRY_SINKS:+console}
+      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+      sinks: ${env.TELEMETRY_SINKS:=console}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template
index 6256cc7e1..03f3759c3 100644
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@@ -25,16 +25,8 @@ spec:
         app.kubernetes.io/name: vllm
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm
         image: vllm/vllm-openai:latest
@@ -42,6 +34,8 @@ spec:
         args:
         - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
         env:
+        - name: INFERENCE_MODEL
+          value: "${INFERENCE_MODEL}"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
@@ -49,6 +43,11 @@ spec:
              key: token
        ports:
        - containerPort: 8000
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+          requests:
+            nvidia.com/gpu: 1
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.cache/huggingface
diff --git a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
index 8857e83b6..37b2b9a6b 100644
--- a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
@@ -6,7 +6,6 @@ spec:
   accessModes:
     - ReadWriteOnce
   volumeMode: Filesystem
-  storageClassName: gp2
   resources:
     requests:
       storage: 30Gi
@@ -26,16 +25,8 @@ spec:
         app.kubernetes.io/name: vllm-safety
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm-safety
         image: vllm/vllm-openai:latest
@@ -44,6 +35,8 @@ spec:
           "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
         ]
         env:
+        - name: SAFETY_MODEL
+          value: "${SAFETY_MODEL}"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
@@ -51,6 +44,11 @@ spec:
              key: token
        ports:
        - containerPort: 8001
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+          requests:
+            nvidia.com/gpu: 1
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.cache/huggingface
diff --git a/docs/source/distributions/kubernetes_deployment.md b/docs/source/distributions/kubernetes_deployment.md
index f43039824..3a8dccd07 100644
--- a/docs/source/distributions/kubernetes_deployment.md
+++ b/docs/source/distributions/kubernetes_deployment.md
@@ -5,6 +5,8 @@ Instead of starting the Llama Stack and vLLM servers locally. We can deploy them
 ### Prerequisites
 In this guide, we'll use a local [Kind](https://kind.sigs.k8s.io/) cluster and a vLLM inference service in the same cluster for demonstration purposes.
 
+Note: You can also deploy the Llama Stack server in an AWS EKS cluster. See [Deploying Llama Stack Server in AWS EKS](#deploying-llama-stack-server-in-aws-eks) for more details.
+
 First, create a local Kubernetes cluster via Kind:
 
 ```
@@ -217,3 +219,18 @@ Finally, we forward the Kubernetes service to a local port and test some inferen
 kubectl port-forward service/llama-stack-service 5000:5000
 llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
 ```
+
+## Deploying Llama Stack Server in AWS EKS
+
+We've also provided a script to deploy the Llama Stack server in an AWS EKS cluster. Once you have an [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/getting-started.html), you can deploy the server by running:
+
+
+```
+cd docs/source/distributions/eks
+./apply.sh
+```
+
+This script will:
+
+- Set up a default storage class for AWS EKS
+- Deploy the Llama Stack server in a Kubernetes Pod and Service
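+
+The script expects `HF_TOKEN` to be exported in your shell; it base64-encodes the token and applies it as the `hf-token-secret` Secret that the vLLM pods read. If you prefer to create that secret yourself, an equivalent manual step (a sketch only; `apply.sh` already does this for you) is:
+
+```
+kubectl create secret generic hf-token-secret --from-literal=token="${HF_TOKEN}"
+```
+
+To confirm the deployment came up, checks along these lines should work (the service name below matches the `llama-stack-service` used earlier in this guide; adjust it if you have customized the templates):
+
+```
+# Verify the gp3 storage class exists and is marked as default
+kubectl get storageclass gp3-topology-aware
+
+# Watch the vLLM, Postgres, and Llama Stack pods become Ready
+kubectl get pods -w
+
+# Forward the service and run a test request
+kubectl port-forward service/llama-stack-service 5000:5000
+llama-stack-client --endpoint http://localhost:5000 inference chat-completion --message "hello, what model are you?"
+```
\ No newline at end of file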