From 4121166784f1c424b49832433929ae853ebda83a Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sun, 1 Jun 2025 15:59:00 -0700 Subject: [PATCH] split off safety so it can be applied one at a time --- docs/source/distributions/k8s/apply.sh | 7 ++ .../k8s/chroma-k8s.yaml.template | 2 +- .../distributions/k8s/stack-k8s.yaml.template | 4 ++ .../distributions/k8s/vllm-k8s.yaml.template | 65 +------------------ .../k8s/vllm-safety-k8s.yaml.template | 49 ++++++++++++++ 5 files changed, 62 insertions(+), 65 deletions(-) mode change 100644 => 100755 docs/source/distributions/k8s/apply.sh create mode 100644 docs/source/distributions/k8s/vllm-safety-k8s.yaml.template diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh old mode 100644 new mode 100755 index c8fe2d68a..061d9d0da --- a/docs/source/distributions/k8s/apply.sh +++ b/docs/source/distributions/k8s/apply.sh @@ -1,5 +1,11 @@ #!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the terms described in the LICENSE file in +# the root directory of this source tree. + export POSTGRES_USER=${POSTGRES_USER:-llamastack} export POSTGRES_DB=${POSTGRES_DB:-llamastack} export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack} @@ -11,6 +17,7 @@ set -euo pipefail set -x envsubst < ./vllm-k8s.yaml.template | kubectl apply -f - +envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f - envsubst < ./postgres-k8s.yaml.template | kubectl apply -f - envsubst < ./chroma-k8s.yaml.template | kubectl apply -f - diff --git a/docs/source/distributions/k8s/chroma-k8s.yaml.template b/docs/source/distributions/k8s/chroma-k8s.yaml.template index c6733c121..dd7f9a8c1 100644 --- a/docs/source/distributions/k8s/chroma-k8s.yaml.template +++ b/docs/source/distributions/k8s/chroma-k8s.yaml.template @@ -5,7 +5,7 @@ metadata: spec: accessModes: - ReadWriteOnce - storageClassName: gp2 + storageClassName: gp2 resources: requests: storage: 20Gi diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template index 3e7df0084..8a655a3b5 100644 --- a/docs/source/distributions/k8s/stack-k8s.yaml.template +++ b/docs/source/distributions/k8s/stack-k8s.yaml.template @@ -29,8 +29,12 @@ spec: image: llamastack/distribution-remote-vllm:latest imagePullPolicy: Always # since we have specified latest instead of a version env: + - name: ENABLE_CHROMADB + value: "true" - name: VLLM_URL value: http://vllm-server.default.svc.cluster.local:8000/v1 + - name: VLLM_MAX_TOKENS + value: "3072" - name: VLLM_SAFETY_URL value: http://vllm-server-safety.default.svc.cluster.local:8001/v1 - name: POSTGRES_HOST diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template index da153a65d..4af13f563 100644 --- a/docs/source/distributions/k8s/vllm-k8s.yaml.template +++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template @@ -32,7 +32,7 @@ spec: image: vllm/vllm-openai:latest command: ["/bin/sh", "-c"] args: - - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.55" + - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.5" env: - name: HUGGING_FACE_HUB_TOKEN valueFrom: @@ -48,66 +48,3 @@ spec: - name: llama-storage persistentVolumeClaim: claimName: vllm-models ---- -apiVersion: v1 -kind: Service -metadata: - name: vllm-server -spec: - selector: - app.kubernetes.io/name: vllm - ports: - - protocol: TCP - port: 8000 - targetPort: 8000 - type: ClusterIP ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: vllm-server-safety -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: vllm-safety - template: - metadata: - labels: - app.kubernetes.io/name: vllm-safety - spec: - containers: - - name: vllm-safety - image: vllm/vllm-openai:latest - command: ["/bin/sh", "-c"] - args: [ - "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.28" - ] - env: - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - ports: - - containerPort: 8001 - volumeMounts: - - name: llama-storage - mountPath: /root/.cache/huggingface - volumes: - - name: llama-storage - persistentVolumeClaim: - claimName: vllm-models ---- -apiVersion: v1 -kind: Service -metadata: - name: vllm-server-safety -spec: - selector: - app.kubernetes.io/name: vllm-safety - ports: - - protocol: TCP - port: 8001 - targetPort: 8001 - type: ClusterIP diff --git a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template new file mode 100644 index 000000000..26fc9ee37 --- /dev/null +++ b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template @@ -0,0 +1,49 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-server-safety +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: vllm-safety + template: + metadata: + labels: + app.kubernetes.io/name: vllm-safety + spec: + containers: + - name: vllm-safety + image: vllm/vllm-openai:latest + command: ["/bin/sh", "-c"] + args: [ + "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.28" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8001 + volumeMounts: + - name: llama-storage + mountPath: /root/.cache/huggingface + volumes: + - name: llama-storage + persistentVolumeClaim: + claimName: vllm-models +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-server-safety +spec: + selector: + app.kubernetes.io/name: vllm-safety + ports: + - protocol: TCP + port: 8001 + targetPort: 8001 + type: ClusterIP