From 6cbb3366f2dd5a4fcc2d34a7eb4b0103ed8cc1ab Mon Sep 17 00:00:00 2001 From: Ashwin Bharambe Date: Sun, 1 Jun 2025 17:07:18 -0700 Subject: [PATCH] more fixes, gah --- .../distributions/k8s/stack-k8s.yaml.template | 2 ++ .../source/distributions/k8s/vllm-k8s.yaml.template | 13 +++++++++++++ .../distributions/k8s/vllm-safety-k8s.yaml.template | 2 +- 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template index 8a655a3b5..a076a7a1d 100644 --- a/docs/source/distributions/k8s/stack-k8s.yaml.template +++ b/docs/source/distributions/k8s/stack-k8s.yaml.template @@ -31,6 +31,8 @@ spec: env: - name: ENABLE_CHROMADB value: "true" + - name: CHROMADB_URL + value: http://chromadb.default.svc.cluster.local:6000 - name: VLLM_URL value: http://vllm-server.default.svc.cluster.local:8000/v1 - name: VLLM_MAX_TOKENS diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template index 0be4aba0d..c568d55d2 100644 --- a/docs/source/distributions/k8s/vllm-k8s.yaml.template +++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template @@ -57,3 +57,16 @@ spec: - name: llama-storage persistentVolumeClaim: claimName: vllm-models +--- +apiVersion: v1 +kind: Service +metadata: + name: vllm-server +spec: + selector: + app.kubernetes.io/name: vllm + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 + type: ClusterIP diff --git a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template index 8b948d94d..8857e83b6 100644 --- a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template +++ b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template @@ -41,7 +41,7 @@ spec: image: vllm/vllm-openai:latest command: ["/bin/sh", "-c"] args: [ - "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.3" + "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3" ] env: - name: HUGGING_FACE_HUB_TOKEN