diff --git a/docs/docs/distributions/eks/apply.sh b/docs/docs/distributions/eks/apply.sh
new file mode 100755
index 000000000..3ad3dd263
--- /dev/null
+++ b/docs/docs/distributions/eks/apply.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+set -euo pipefail
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+K8S_DIR="${SCRIPT_DIR}/../k8s"
+
+echo "Setting up AWS EKS-specific storage class..."
+kubectl apply -f "${SCRIPT_DIR}/gp3-topology-aware.yaml"
+
+echo "Running main Kubernetes deployment..."
+cd "${K8S_DIR}"
+./apply.sh "$@"
diff --git a/docs/docs/distributions/eks/gp3-topology-aware.yaml b/docs/docs/distributions/eks/gp3-topology-aware.yaml
new file mode 100644
index 000000000..1192ba18c
--- /dev/null
+++ b/docs/docs/distributions/eks/gp3-topology-aware.yaml
@@ -0,0 +1,15 @@
+# Set up default storage class on AWS EKS
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+  name: gp3-topology-aware
+  annotations:
+    storageclass.kubernetes.io/is-default-class: "true"
+parameters:
+  type: gp3
+  iops: "3000"
+  throughput: "125"
+provisioner: ebs.csi.aws.com
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+allowVolumeExpansion: true
diff --git a/docs/docs/distributions/k8s/apply.sh b/docs/docs/distributions/k8s/apply.sh
new file mode 100755
index 000000000..1b5b26863
--- /dev/null
+++ b/docs/docs/distributions/k8s/apply.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
+
+export POSTGRES_USER=llamastack
+export POSTGRES_DB=llamastack
+export POSTGRES_PASSWORD=llamastack
+
+export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
+
+# HF_TOKEN should be set by the user; base64 encode it for the secret
+if [ -n "${HF_TOKEN:-}" ]; then
+  export HF_TOKEN_BASE64=$(echo -n "$HF_TOKEN" | base64)
+else
+  echo "ERROR: HF_TOKEN not set. You need it for vLLM to download models from Hugging Face."
+  exit 1
+fi
+
+if [ -z "${GITHUB_CLIENT_ID:-}" ]; then
+  echo "ERROR: GITHUB_CLIENT_ID not set. You need it for GitHub login to work. See the Kubernetes Deployment Guide in the Llama Stack documentation."
+  exit 1
+fi
+
+if [ -z "${GITHUB_CLIENT_SECRET:-}" ]; then
+  echo "ERROR: GITHUB_CLIENT_SECRET not set. You need it for GitHub login to work. See the Kubernetes Deployment Guide in the Llama Stack documentation."
+  exit 1
+fi
+
+if [ -z "${LLAMA_STACK_UI_URL:-}" ]; then
+  echo "ERROR: LLAMA_STACK_UI_URL not set. Should be set to the external URL of the UI (excluding the port). You need it for GitHub login to work. See the Kubernetes Deployment Guide in the Llama Stack documentation."
+  exit 1
+fi
+
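+# Fail fast and echo each command from here on.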
+set -euo pipefail
+set -x
+
+# Apply the HF token secret if HF_TOKEN is provided
+if [ -n "${HF_TOKEN:-}" ]; then
+  envsubst < ./hf-token-secret.yaml.template | kubectl apply -f -
+fi
+
+envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
+envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
+envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
+envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
+
+kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
+  --dry-run=client -o yaml > stack-configmap.yaml
+
+kubectl apply -f stack-configmap.yaml
+
+envsubst < ./stack-k8s.yaml.template | kubectl apply -f -
+envsubst < ./ingress-k8s.yaml.template | kubectl apply -f -
+
+envsubst < ./ui-k8s.yaml.template | kubectl apply -f -
diff --git a/docs/docs/distributions/k8s/chroma-k8s.yaml.template b/docs/docs/distributions/k8s/chroma-k8s.yaml.template
new file mode 100644
index 000000000..a2a5e3be3
--- /dev/null
+++ b/docs/docs/distributions/k8s/chroma-k8s.yaml.template
@@ -0,0 +1,67 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: chromadb-pvc
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      storage: 20Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: chromadb
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: chromadb
+  template:
+    metadata:
+      labels:
+        app: chromadb
+    spec:
+      containers:
+      - name: chromadb
+        image: chromadb/chroma:latest
+        ports:
+        - containerPort: 6000
+        env:
+        - name: CHROMA_HOST
+          value: "0.0.0.0"
+        - name: CHROMA_PORT
+          value: "6000"
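+        # Persist Chroma's data on the PVC-backed path mounted below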
+        - name: PERSIST_DIRECTORY
+          value: "/chroma/chroma"
+        - name: CHROMA_DB_IMPL
+          value: "duckdb+parquet"
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "250m"
+          limits:
+            memory: "2Gi"
+            cpu: "1000m"
+        volumeMounts:
+        - name: chromadb-storage
+          mountPath: /chroma/chroma
+      volumes:
+      - name: chromadb-storage
+        persistentVolumeClaim:
+          claimName: chromadb-pvc
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: chromadb
+spec:
+  selector:
+    app: chromadb
+  ports:
+  - protocol: TCP
+    port: 6000
+    targetPort: 6000
+  type: ClusterIP
diff --git a/docs/docs/distributions/k8s/hf-token-secret.yaml.template b/docs/docs/distributions/k8s/hf-token-secret.yaml.template
new file mode 100644
index 000000000..b6db8e7bc
--- /dev/null
+++ b/docs/docs/distributions/k8s/hf-token-secret.yaml.template
@@ -0,0 +1,7 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: hf-token-secret
+type: Opaque
+data:
+  token: ${HF_TOKEN_BASE64}
diff --git a/docs/docs/distributions/k8s/ingress-k8s.yaml.template b/docs/docs/distributions/k8s/ingress-k8s.yaml.template
new file mode 100644
index 000000000..9ebe86b69
--- /dev/null
+++ b/docs/docs/distributions/k8s/ingress-k8s.yaml.template
@@ -0,0 +1,18 @@
+# A LoadBalancer Service (not a Kubernetes Ingress) that exposes the API and UI externally
+apiVersion: v1
+kind: Service
+metadata:
+  name: llama-stack-service
+spec:
+  type: LoadBalancer
+  selector:
+    app.kubernetes.io/name: llama-stack
+  ports:
+  - name: llama-stack-api
+    port: 8321
+    targetPort: 8321
+    protocol: TCP
+  - name: llama-stack-ui
+    port: 8322
+    targetPort: 8322
+    protocol: TCP
diff --git a/docs/docs/distributions/k8s/postgres-k8s.yaml.template b/docs/docs/distributions/k8s/postgres-k8s.yaml.template
new file mode 100644
index 000000000..86a765652
--- /dev/null
+++ b/docs/docs/distributions/k8s/postgres-k8s.yaml.template
@@ -0,0 +1,67 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: postgres-pvc
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      storage: 10Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: postgres
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: postgres
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: postgres
+    spec:
+      containers:
+      - name: postgres
+        image: postgres:15
+        env:
+        - name: POSTGRES_DB
+          value: "${POSTGRES_DB}"
+        - name: POSTGRES_USER
+          value: "${POSTGRES_USER}"
+        - name: POSTGRES_PASSWORD
+          value: "${POSTGRES_PASSWORD}"
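+        # Use a subdirectory: the volume root holds lost+found, which initdb rejects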
+        - name: PGDATA
+          value: "/var/lib/postgresql/data/pgdata"
+        ports:
+        - containerPort: 5432
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "250m"
+          limits:
+            memory: "1Gi"
+            cpu: "500m"
+        volumeMounts:
+        - name: postgres-storage
+          mountPath: /var/lib/postgresql/data
+      volumes:
+      - name: postgres-storage
+        persistentVolumeClaim:
+          claimName: postgres-pvc
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: postgres-server
+spec:
+  selector:
+    app.kubernetes.io/name: postgres
+  ports:
+  - protocol: TCP
+    port: 5432
+    targetPort: 5432
+  type: ClusterIP
diff --git a/docs/docs/distributions/k8s/stack-configmap.yaml b/docs/docs/distributions/k8s/stack-configmap.yaml
new file mode 100644
index 000000000..3dbb0da97
--- /dev/null
+++ b/docs/docs/distributions/k8s/stack-configmap.yaml
@@ -0,0 +1,148 @@
+apiVersion: v1
+data:
+  stack_run_config.yaml: |
+    version: '2'
+    image_name: kubernetes-demo
+    apis:
+    - agents
+    - inference
+    - files
+    - safety
+    - telemetry
+    - tool_runtime
+    - vector_io
+    providers:
+      inference:
+      - provider_id: vllm-inference
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      - provider_id: vllm-safety
+        provider_type: remote::vllm
+        config:
+          url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1}
+          max_tokens: ${env.VLLM_MAX_TOKENS:=4096}
+          api_token: ${env.VLLM_API_TOKEN:=fake}
+          tls_verify: ${env.VLLM_TLS_VERIFY:=true}
+      - provider_id: sentence-transformers
+        provider_type: inline::sentence-transformers
+        config: {}
+      vector_io:
+      # The :+ form registers chromadb only when ENABLE_CHROMADB is set; := supplies a default
+      - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
+        provider_type: remote::chromadb
+        config:
+          url: ${env.CHROMADB_URL:=}
+          kvstore:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+      files:
+      - provider_id: meta-reference-files
+        provider_type: inline::localfs
+        config:
+          storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+          metadata_store:
+            type: sqlite
+            db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
+      safety:
+      - provider_id: llama-guard
+        provider_type: inline::llama-guard
+        config:
+          excluded_categories: []
+      agents:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          persistence_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+          responses_store:
+            type: postgres
+            host: ${env.POSTGRES_HOST:=localhost}
+            port: ${env.POSTGRES_PORT:=5432}
+            db: ${env.POSTGRES_DB:=llamastack}
+            user: ${env.POSTGRES_USER:=llamastack}
+            password: ${env.POSTGRES_PASSWORD:=llamastack}
+      telemetry:
+      - provider_id: meta-reference
+        provider_type: inline::meta-reference
+        config:
+          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+          sinks: ${env.TELEMETRY_SINKS:=console}
+      tool_runtime:
+      - provider_id: brave-search
+        provider_type: remote::brave-search
+        config:
+          api_key: ${env.BRAVE_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: tavily-search
+        provider_type: remote::tavily-search
+        config:
+          api_key: ${env.TAVILY_SEARCH_API_KEY:+}
+          max_results: 3
+      - provider_id: rag-runtime
+        provider_type: inline::rag-runtime
+        config: {}
+      - provider_id: model-context-protocol
+        provider_type: remote::model-context-protocol
+        config: {}
+    metadata_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+      table_name: llamastack_kvstore
+    inference_store:
+      type: postgres
+      host: ${env.POSTGRES_HOST:=localhost}
+      port: ${env.POSTGRES_PORT:=5432}
+      db: ${env.POSTGRES_DB:=llamastack}
+      user: ${env.POSTGRES_USER:=llamastack}
+      password: ${env.POSTGRES_PASSWORD:=llamastack}
+    models:
+    - metadata:
+        embedding_dimension: 384
+      model_id: all-MiniLM-L6-v2
+      provider_id: sentence-transformers
+      model_type: embedding
+    - metadata: {}
+      model_id: ${env.INFERENCE_MODEL}
+      provider_id: vllm-inference
+      model_type: llm
+    - metadata: {}
+      model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
+      provider_id: vllm-safety
+      model_type: llm
+    shields:
+    - shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
+    vector_dbs: []
+    datasets: []
+    scoring_fns: []
+    benchmarks: []
+    tool_groups:
+    - toolgroup_id: builtin::websearch
+      provider_id: tavily-search
+    - toolgroup_id: builtin::rag
+      provider_id: rag-runtime
+    server:
+      port: 8321
+      auth:
+        provider_config:
+          type: github_token
+kind: ConfigMap
+metadata:
+  creationTimestamp: null
+  name: llama-stack-config
diff --git a/docs/docs/distributions/k8s/stack-k8s.yaml.template b/docs/docs/distributions/k8s/stack-k8s.yaml.template
new file mode 100644
index 000000000..dfc049f4f
--- /dev/null
+++ b/docs/docs/distributions/k8s/stack-k8s.yaml.template
@@ -0,0 +1,70 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: llama-pvc
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      storage: 1Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-stack-server
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: llama-stack
+      app.kubernetes.io/component: server
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: llama-stack
+        app.kubernetes.io/component: server
+    spec:
+      containers:
+      - name: llama-stack
+        image: llamastack/distribution-starter:latest
+        imagePullPolicy: Always # since we have specified latest instead of a version
+        env:
+        - name: ENABLE_CHROMADB
+          value: "true"
+        - name: CHROMADB_URL
+          value: http://chromadb.default.svc.cluster.local:6000
+        - name: VLLM_URL
+          value: http://vllm-server.default.svc.cluster.local:8000/v1
+        - name: VLLM_MAX_TOKENS
+          value: "3072"
+        - name: VLLM_SAFETY_URL
+          value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
+        - name: VLLM_TLS_VERIFY
+          value: "false"
+        - name: POSTGRES_HOST
+          value: postgres-server.default.svc.cluster.local
+        - name: POSTGRES_PORT
+          value: "5432"
+        - name: INFERENCE_MODEL
+          value: "${INFERENCE_MODEL}"
+        - name: SAFETY_MODEL
+          value: "${SAFETY_MODEL}"
+        - name: TAVILY_SEARCH_API_KEY
+          value: "${TAVILY_SEARCH_API_KEY}"
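+        # Start the server with the run config mounted from the llama-stack-config ConfigMap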
"llama_stack.core.server.server", "/etc/config/stack_run_config.yaml", "--port", "8321"] + ports: + - containerPort: 8321 + volumeMounts: + - name: llama-storage + mountPath: /root/.llama + - name: llama-config + mountPath: /etc/config + volumes: + - name: llama-storage + persistentVolumeClaim: + claimName: llama-pvc + - name: llama-config + configMap: + name: llama-stack-config diff --git a/docs/docs/distributions/k8s/stack_run_config.yaml b/docs/docs/distributions/k8s/stack_run_config.yaml new file mode 100644 index 000000000..b841ab977 --- /dev/null +++ b/docs/docs/distributions/k8s/stack_run_config.yaml @@ -0,0 +1,140 @@ +version: '2' +image_name: kubernetes-demo +apis: +- agents +- inference +- files +- safety +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: vllm-inference + provider_type: remote::vllm + config: + url: ${env.VLLM_URL:=http://localhost:8000/v1} + max_tokens: ${env.VLLM_MAX_TOKENS:=4096} + api_token: ${env.VLLM_API_TOKEN:=fake} + tls_verify: ${env.VLLM_TLS_VERIFY:=true} + - provider_id: vllm-safety + provider_type: remote::vllm + config: + url: ${env.VLLM_SAFETY_URL:=http://localhost:8000/v1} + max_tokens: ${env.VLLM_MAX_TOKENS:=4096} + api_token: ${env.VLLM_API_TOKEN:=fake} + tls_verify: ${env.VLLM_TLS_VERIFY:=true} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: ${env.ENABLE_CHROMADB:+chromadb} + provider_type: remote::chromadb + config: + url: ${env.CHROMADB_URL:=} + kvstore: + type: postgres + host: ${env.POSTGRES_HOST:=localhost} + port: ${env.POSTGRES_PORT:=5432} + db: ${env.POSTGRES_DB:=llamastack} + user: ${env.POSTGRES_USER:=llamastack} + password: ${env.POSTGRES_PASSWORD:=llamastack} + files: + - provider_id: meta-reference-files + provider_type: inline::localfs + config: + storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files} + metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: + excluded_categories: [] + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: postgres + host: ${env.POSTGRES_HOST:=localhost} + port: ${env.POSTGRES_PORT:=5432} + db: ${env.POSTGRES_DB:=llamastack} + user: ${env.POSTGRES_USER:=llamastack} + password: ${env.POSTGRES_PASSWORD:=llamastack} + responses_store: + type: postgres + host: ${env.POSTGRES_HOST:=localhost} + port: ${env.POSTGRES_PORT:=5432} + db: ${env.POSTGRES_DB:=llamastack} + user: ${env.POSTGRES_USER:=llamastack} + password: ${env.POSTGRES_PASSWORD:=llamastack} + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: "${env.OTEL_SERVICE_NAME:=\u200B}" + sinks: ${env.TELEMETRY_SINKS:=console} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:+} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:+} + max_results: 3 + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: postgres + host: ${env.POSTGRES_HOST:=localhost} + port: ${env.POSTGRES_PORT:=5432} + db: ${env.POSTGRES_DB:=llamastack} + user: 
+  - provider_id: ${env.ENABLE_CHROMADB:+chromadb}
+    provider_type: remote::chromadb
+    config:
+      url: ${env.CHROMADB_URL:=}
+      kvstore:
+        type: postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+  files:
+  - provider_id: meta-reference-files
+    provider_type: inline::localfs
+    config:
+      storage_dir: ${env.FILES_STORAGE_DIR:=~/.llama/distributions/starter/files}
+      metadata_store:
+        type: sqlite
+        db_path: ${env.SQLITE_STORE_DIR:=~/.llama/distributions/starter}/files_metadata.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config:
+      excluded_categories: []
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+      responses_store:
+        type: postgres
+        host: ${env.POSTGRES_HOST:=localhost}
+        port: ${env.POSTGRES_PORT:=5432}
+        db: ${env.POSTGRES_DB:=llamastack}
+        user: ${env.POSTGRES_USER:=llamastack}
+        password: ${env.POSTGRES_PASSWORD:=llamastack}
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+      sinks: ${env.TELEMETRY_SINKS:=console}
+  tool_runtime:
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:+}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:+}
+      max_results: 3
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+    config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+metadata_store:
+  type: postgres
+  host: ${env.POSTGRES_HOST:=localhost}
+  port: ${env.POSTGRES_PORT:=5432}
+  db: ${env.POSTGRES_DB:=llamastack}
+  user: ${env.POSTGRES_USER:=llamastack}
+  password: ${env.POSTGRES_PASSWORD:=llamastack}
+  table_name: llamastack_kvstore
+inference_store:
+  type: postgres
+  host: ${env.POSTGRES_HOST:=localhost}
+  port: ${env.POSTGRES_PORT:=5432}
+  db: ${env.POSTGRES_DB:=llamastack}
+  user: ${env.POSTGRES_USER:=llamastack}
+  password: ${env.POSTGRES_PASSWORD:=llamastack}
+models:
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm-inference
+  model_type: llm
+- metadata: {}
+  model_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
+  provider_id: vllm-safety
+  model_type: llm
+shields:
+- shield_id: ${env.SAFETY_MODEL:=meta-llama/Llama-Guard-3-1B}
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+tool_groups:
+- toolgroup_id: builtin::websearch
+  provider_id: tavily-search
+- toolgroup_id: builtin::rag
+  provider_id: rag-runtime
+server:
+  port: 8321
+  auth:
+    provider_config:
+      type: github_token
diff --git a/docs/docs/distributions/k8s/ui-k8s.yaml.template b/docs/docs/distributions/k8s/ui-k8s.yaml.template
new file mode 100644
index 000000000..a6859cb86
--- /dev/null
+++ b/docs/docs/distributions/k8s/ui-k8s.yaml.template
@@ -0,0 +1,69 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: llama-stack-ui
+  labels:
+    app.kubernetes.io/name: llama-stack
+    app.kubernetes.io/component: ui
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: llama-stack
+      app.kubernetes.io/component: ui
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: llama-stack
+        app.kubernetes.io/component: ui
+    spec:
+      containers:
+      - name: llama-stack-ui
+        image: node:18-alpine
+        command: ["/bin/sh"]
+        env:
+        - name: LLAMA_STACK_BACKEND_URL
+          value: "http://llama-stack-service:8321"
+        - name: LLAMA_STACK_UI_PORT
+          value: "8322"
+        - name: GITHUB_CLIENT_ID
+          value: "${GITHUB_CLIENT_ID}"
+        - name: GITHUB_CLIENT_SECRET
+          value: "${GITHUB_CLIENT_SECRET}"
+        - name: NEXTAUTH_URL
+          value: "${LLAMA_STACK_UI_URL}:8322"
+        args:
+        - -c
+        - |
+          # Install git (not included in alpine by default)
+          apk add --no-cache git
+
+          # Clone the repository
+          echo "Cloning repository..."
+          git clone https://github.com/meta-llama/llama-stack.git /app
+
+          # Navigate to the UI directory
+          echo "Navigating to UI directory..."
+          cd /app/llama_stack/ui
+
+          # Check if package.json exists
+          if [ ! -f "package.json" ]; then
+            echo "ERROR: package.json not found in $(pwd)"
+            ls -la
+            exit 1
+          fi
+
+          # Install dependencies with verbose output
+          echo "Installing dependencies..."
+          npm install --verbose
+
+          # Verify next is installed
+          echo "Checking if next is installed..."
+          npx next --version || echo "Next.js not found, checking node_modules..."
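+          # Extra diagnostics if the Next.js binary is missing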
+          ls -la node_modules/.bin/ | grep next || echo "No next binary found"
+
+          npm run dev
+        ports:
+        - containerPort: 8322
+        workingDir: /app
diff --git a/docs/docs/distributions/k8s/vllm-k8s.yaml.template b/docs/docs/distributions/k8s/vllm-k8s.yaml.template
new file mode 100644
index 000000000..22bee4bbc
--- /dev/null
+++ b/docs/docs/distributions/k8s/vllm-k8s.yaml.template
@@ -0,0 +1,71 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vllm-models
+spec:
+  accessModes:
+  - ReadWriteOnce
+  volumeMode: Filesystem
+  resources:
+    requests:
+      storage: 50Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vllm
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm
+        workload-type: inference
+    spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
+      containers:
+      - name: vllm
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        # fp16 weights, eager mode (no CUDA graphs), 4096-token context, 60% GPU memory cap
+        args:
+        - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6 --enable-auto-tool-choice --tool-call-parser llama4_pythonic"
+        env:
+        - name: INFERENCE_MODEL
+          value: "${INFERENCE_MODEL}"
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
+        ports:
+        - containerPort: 8000
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+          requests:
+            nvidia.com/gpu: 1
+        volumeMounts:
+        - name: llama-storage
+          mountPath: /root/.cache/huggingface
+      volumes:
+      - name: llama-storage
+        persistentVolumeClaim:
+          claimName: vllm-models
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-server
+spec:
+  selector:
+    app.kubernetes.io/name: vllm
+  ports:
+  - protocol: TCP
+    port: 8000
+    targetPort: 8000
+  type: ClusterIP
diff --git a/docs/docs/distributions/k8s/vllm-safety-k8s.yaml.template b/docs/docs/distributions/k8s/vllm-safety-k8s.yaml.template
new file mode 100644
index 000000000..37b2b9a6b
--- /dev/null
+++ b/docs/docs/distributions/k8s/vllm-safety-k8s.yaml.template
@@ -0,0 +1,72 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: vllm-models-safety
+spec:
+  accessModes:
+  - ReadWriteOnce
+  volumeMode: Filesystem
+  resources:
+    requests:
+      storage: 30Gi
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server-safety
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vllm-safety
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm-safety
+        workload-type: inference
+    spec:
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
+      containers:
+      - name: vllm-safety
+        image: vllm/vllm-openai:latest
+        command: ["/bin/sh", "-c"]
+        # Serve the safety model on port 8001 with a conservative 30% GPU memory cap
+        args: [
+          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
+        ]
+        env:
+        - name: SAFETY_MODEL
+          value: "${SAFETY_MODEL}"
+        - name: HUGGING_FACE_HUB_TOKEN
+          valueFrom:
+            secretKeyRef:
+              name: hf-token-secret
+              key: token
+        ports:
+        - containerPort: 8001
+        resources:
+          limits:
+            nvidia.com/gpu: 1
+          requests:
+            nvidia.com/gpu: 1
+        volumeMounts:
+        - name: llama-storage
+          mountPath: /root/.cache/huggingface
+      volumes:
+      - name: llama-storage
+        persistentVolumeClaim:
+          claimName: vllm-models-safety
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-server-safety
+spec:
+  selector:
+    app.kubernetes.io/name: vllm-safety
+  ports:
+  - protocol: TCP
+    port: 8001
+    targetPort: 8001
+  type: ClusterIP