play around with util

Ashwin Bharambe 2025-06-01 15:34:43 -07:00
parent a36b0c5fe3
commit d93f6c9e5b
2 changed files with 3 additions and 3 deletions

[Changed file 1 of 2: Kubernetes manifest for the llama-stack Deployment]

@@ -27,7 +27,7 @@ spec:
       containers:
       - name: llama-stack
         image: llamastack/distribution-remote-vllm:latest
-        imagePullPolicy: IfNotPresent
+        imagePullPolicy: Always  # since we have specified latest instead of a version
         env:
         - name: VLLM_URL
           value: http://vllm-server.default.svc.cluster.local:8000/v1
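The pull-policy change follows standard Kubernetes practice: the image is pinned to the mutable latest tag rather than a version, so IfNotPresent would let a node keep running whatever copy it already has cached, while Always forces a fresh pull each time a pod starts. A minimal Deployment sketch showing where the field sits (everything outside the lines in the diff, such as the metadata and selector, is an assumption for illustration):

    apiVersion: apps/v1
    kind: Deployment
    metadata:
      name: llama-stack            # assumed name
    spec:
      replicas: 1
      selector:
        matchLabels:
          app: llama-stack
      template:
        metadata:
          labels:
            app: llama-stack
        spec:
          containers:
            - name: llama-stack
              image: llamastack/distribution-remote-vllm:latest
              imagePullPolicy: Always   # "latest" can move, so always re-pull
              env:
                - name: VLLM_URL
                  value: http://vllm-server.default.svc.cluster.local:8000/v1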

[Changed file 2 of 2: Kubernetes manifest for the vLLM server Deployments]

@@ -32,7 +32,7 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-          - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.5"
+          - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.55"
         env:
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
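In the vLLM manifest, the inference server trades context length for memory headroom: --max-model-len caps the sequence length vLLM pre-allocates KV cache for (8192 down to 6144), and --gpu-memory-utilization is the fraction of GPU memory vLLM may claim (0.5 up to 0.55). A sketch of the resulting container entry; the container name and the concrete model substituted for ${INFERENCE_MODEL} are assumptions:

    - name: vllm-server                     # assumed container name
      image: vllm/vllm-openai:latest
      command: ["/bin/sh", "-c"]
      args:
        # Shorter context (6144) shrinks the KV-cache reservation; 0.55 is the
        # share of GPU memory this server may claim.
        - "vllm serve meta-llama/Llama-3.1-8B-Instruct --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.55"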
@@ -81,7 +81,7 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args: [
-          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.3"
+          "vllm serve ${SAFETY_MODEL} --dtype float16 --enforce-eager --max-model-len 6144 --gpu-memory-utilization 0.28"
         ]
         env:
         - name: HUGGING_FACE_HUB_TOKEN
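Taken together, the two --gpu-memory-utilization values describe how a single GPU is split when both vLLM servers land on the same device:

    inference server: 0.55
    safety server:    0.28
    total reserved:   0.83   (previously 0.50 + 0.30 = 0.80)

which leaves roughly 17% of device memory for CUDA context, activations, and fragmentation, while the lower --max-model-len (6144 instead of 8192) reduces each server's KV-cache reservation. Whether the two pods actually share one GPU depends on scheduling and resource requests that are not part of this diff.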