diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
index 8a655a3b5..a076a7a1d 100644
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -31,6 +31,8 @@ spec:
         env:
         - name: ENABLE_CHROMADB
           value: "true"
+        - name: CHROMADB_URL
+          value: http://chromadb.default.svc.cluster.local:6000
         - name: VLLM_URL
           value: http://vllm-server.default.svc.cluster.local:8000/v1
         - name: VLLM_MAX_TOKENS
diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template
index 0be4aba0d..c568d55d2 100644
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@@ -57,3 +57,16 @@ spec:
       - name: llama-storage
         persistentVolumeClaim:
           claimName: vllm-models
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: vllm-server
+spec:
+  selector:
+    app.kubernetes.io/name: vllm
+  ports:
+  - protocol: TCP
+    port: 8000
+    targetPort: 8000
+  type: ClusterIP
diff --git a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
index 8b948d94d..8857e83b6 100644
--- a/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-safety-k8s.yaml.template
@@ -41,7 +41,7 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args: [
-          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.3"
+          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --port 8001 --gpu-memory-utilization 0.3"
         ]
         env:
         - name: HUGGING_FACE_HUB_TOKEN