From d93f6c9e5b71947d665852d49fc765a844275f37 Mon Sep 17 00:00:00 2001
From: Ashwin Bharambe
Date: Sun, 1 Jun 2025 15:34:43 -0700
Subject: [PATCH] play around with util

---
 docs/source/distributions/k8s/stack-k8s.yaml.template | 2 +-
 docs/source/distributions/k8s/vllm-k8s.yaml.template  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
index 198e88aed..3e7df0084 100644
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -27,7 +27,7 @@ spec:
       containers:
       - name: llama-stack
         image: llamastack/distribution-remote-vllm:latest
-        imagePullPolicy: IfNotPresent
+        imagePullPolicy: Always # since we have specified latest instead of a version
         env:
         - name: VLLM_URL
           value: http://vllm-server.default.svc.cluster.local:8000/v1
diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template
index ff060ac41..da153a65d 100644
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@@ -32,7 +32,7 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-        - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.5"
+        - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.55"
         env:
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
@@ -81,7 +81,7 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args: [
-          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.3"
+          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.28"
         ]
         env:
         - name: HUGGING_FACE_HUB_TOKEN
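
Note (not part of the commit): the two --gpu-memory-utilization values in this patch (0.55 for the inference server, 0.28 for the safety server) sum to 0.83, consistent with both vLLM servers sharing a single GPU, and --max-model-len 6144 trims the KV-cache footprint to match. Since these are .yaml.template files with ${...} placeholders, they must be rendered before being applied. A minimal sketch of that step, assuming envsubst is used for substitution; the model IDs below are hypothetical placeholders, and the repo's actual apply script is not shown in this patch:

    # Hypothetical model ids: substitute whichever models you actually deploy
    export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
    export SAFETY_MODEL=meta-llama/Llama-Guard-3-1B
    # Render each template and apply the result to the cluster
    envsubst < docs/source/distributions/k8s/vllm-k8s.yaml.template | kubectl apply -f -
    envsubst < docs/source/distributions/k8s/stack-k8s.yaml.template | kubectl apply -f -

With imagePullPolicy: Always, each pod start re-pulls the :latest tag, so the deployment picks up new images without a manifest change.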