split off safety so it can be applied one at a time

Ashwin Bharambe 2025-06-01 15:59:00 -07:00
parent d93f6c9e5b
commit 4121166784
5 changed files with 62 additions and 65 deletions
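With the safety manifests split into their own template, the safety half of the stack can be applied (or skipped) on its own. A minimal sketch of that one-at-a-time flow, assuming the same working directory and environment-variable conventions as apply.sh below (the model id is only an example value, not taken from this commit):

    export SAFETY_MODEL=${SAFETY_MODEL:-meta-llama/Llama-Guard-3-8B}   # example value
    envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -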

docs/source/distributions/k8s/apply.sh (Normal file → Executable file)

@@ -1,5 +1,11 @@
 #!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the terms described in the LICENSE file in
+# the root directory of this source tree.
 export POSTGRES_USER=${POSTGRES_USER:-llamastack}
 export POSTGRES_DB=${POSTGRES_DB:-llamastack}
 export POSTGRES_PASSWORD=${POSTGRES_PASSWORD:-llamastack}

@@ -11,6 +17,7 @@ set -euo pipefail
 set -x
 envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
+envsubst < ./vllm-safety-k8s.yaml.template | kubectl apply -f -
 envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
 envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -


@@ -5,7 +5,7 @@ metadata:
 spec:
   accessModes:
     - ReadWriteOnce
   storageClassName: gp2
   resources:
     requests:
       storage: 20Gi


@@ -29,8 +29,12 @@ spec:
         image: llamastack/distribution-remote-vllm:latest
         imagePullPolicy: Always # since we have specified latest instead of a version
         env:
+          - name: ENABLE_CHROMADB
+            value: "true"
           - name: VLLM_URL
             value: http://vllm-server.default.svc.cluster.local:8000/v1
+          - name: VLLM_MAX_TOKENS
+            value: "3072"
           - name: VLLM_SAFETY_URL
             value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
           - name: POSTGRES_HOST
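VLLM_URL and VLLM_SAFETY_URL point at two separate in-cluster Services, one per vLLM deployment. A quick sanity check for both endpoints, as a sketch assuming it is run from a pod inside the cluster (/v1/models is vLLM's OpenAI-compatible model-listing route):

    curl -s http://vllm-server.default.svc.cluster.local:8000/v1/models
    curl -s http://vllm-server-safety.default.svc.cluster.local:8001/v1/models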

docs/source/distributions/k8s/vllm-k8s.yaml.template

@@ -32,7 +32,7 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-          - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.55"
+          - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.5"
         env:
           - name: HUGGING_FACE_HUB_TOKEN
             valueFrom:
@@ -48,66 +48,3 @@ spec:
         - name: llama-storage
           persistentVolumeClaim:
             claimName: vllm-models
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vllm-server
-spec:
-  selector:
-    app.kubernetes.io/name: vllm
-  ports:
-    - protocol: TCP
-      port: 8000
-      targetPort: 8000
-  type: ClusterIP
----
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: vllm-server-safety
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app.kubernetes.io/name: vllm-safety
-  template:
-    metadata:
-      labels:
-        app.kubernetes.io/name: vllm-safety
-    spec:
-      containers:
-        - name: vllm-safety
-          image: vllm/vllm-openai:latest
-          command: ["/bin/sh", "-c"]
-          args: [
-            "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.28"
-          ]
-          env:
-            - name: HUGGING_FACE_HUB_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          ports:
-            - containerPort: 8001
-          volumeMounts:
-            - name: llama-storage
-              mountPath: /root/.cache/huggingface
-      volumes:
-        - name: llama-storage
-          persistentVolumeClaim:
-            claimName: vllm-models
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: vllm-server-safety
-spec:
-  selector:
-    app.kubernetes.io/name: vllm-safety
-  ports:
-    - protocol: TCP
-      port: 8001
-      targetPort: 8001
-  type: ClusterIP

docs/source/distributions/k8s/vllm-safety-k8s.yaml.template (New file)

@@ -0,0 +1,49 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-server-safety
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: vllm-safety
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: vllm-safety
+    spec:
+      containers:
+        - name: vllm-safety
+          image: vllm/vllm-openai:latest
+          command: ["/bin/sh", "-c"]
+          args: [
+            "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.28"
+          ]
+          env:
+            - name: HUGGING_FACE_HUB_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          ports:
+            - containerPort: 8001
+          volumeMounts:
+            - name: llama-storage
+              mountPath: /root/.cache/huggingface
+      volumes:
+        - name: llama-storage
+          persistentVolumeClaim:
+            claimName: vllm-models
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-server-safety
+spec:
+  selector:
+    app.kubernetes.io/name: vllm-safety
+  ports:
+    - protocol: TCP
+      port: 8001
+      targetPort: 8001
+  type: ClusterIP
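
Since the safety Deployment and Service now live in their own template, they can be verified in isolation; a smoke-test sketch using the resource names defined above (the port-forward is only for local testing):

    kubectl rollout status deployment/vllm-server-safety
    kubectl port-forward service/vllm-server-safety 8001:8001 &
    curl -s http://localhost:8001/v1/models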