This commit is contained in:
Kai Wu 2025-08-03 14:01:27 -07:00
parent 4e19f15bca
commit dcc47c2008
6 changed files with 351 additions and 8927 deletions

View file

@@ -39,7 +39,7 @@ spec:
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args:
- "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.7 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 4 --port 8001"
- "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 100000 --gpu-memory-utilization 0.9 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 2 --port 8001"
env:
- name: INFERENCE_MODEL
value: "${INFERENCE_MODEL}"