Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-12-24 07:08:04 +00:00)
adhoc, test, rm storage class
# What does this PR do?

## Test Plan
parent 83c89265e0
commit 2f51129495

9 changed files with 100 additions and 32 deletions
```diff
@@ -25,16 +25,8 @@ spec:
        app.kubernetes.io/name: vllm
        workload-type: inference
    spec:
      affinity:
        podAntiAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
          - labelSelector:
              matchExpressions:
              - key: workload-type
                operator: In
                values:
                - inference
            topologyKey: kubernetes.io/hostname  # Ensures no two inference pods on same node
      nodeSelector:
        eks.amazonaws.com/nodegroup: gpu
      containers:
      - name: vllm
        image: vllm/vllm-openai:latest
```
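With `requiredDuringSchedulingIgnoredDuringExecution` and `topologyKey: kubernetes.io/hostname`, the scheduler never places two pods labeled `workload-type: inference` on the same node; if no eligible GPU node is free, surplus replicas stay Pending. A softer variant, sketched below as an assumption (it is not part of this diff), uses the preferred form so extra replicas can still co-schedule under pressure:

```yaml
# Hypothetical alternative to the rule above (not in this PR): prefer,
# rather than require, spreading inference pods across nodes.
affinity:
  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
    - weight: 100
      podAffinityTerm:
        labelSelector:
          matchExpressions:
          - key: workload-type
            operator: In
            values:
            - inference
        topologyKey: kubernetes.io/hostname
```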
```diff
@@ -42,6 +34,8 @@ spec:
        args:
        - "vllm serve ${INFERENCE_MODEL} --dtype float16 --enforce-eager --max-model-len 4096 --gpu-memory-utilization 0.6"
        env:
        - name: INFERENCE_MODEL
          value: "${INFERENCE_MODEL}"
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
```
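`HUGGING_FACE_HUB_TOKEN` is read from a Secret under the key `token`; the Secret's `name:` line falls between the two hunks, so it is not visible here. A minimal sketch of such a Secret, with `hf-token-secret` as an assumed name:

```yaml
# Sketch only: the Secret's real name is cut off between the hunks above,
# so "hf-token-secret" is an assumption, not taken from this diff.
apiVersion: v1
kind: Secret
metadata:
  name: hf-token-secret
type: Opaque
stringData:
  token: <your-hugging-face-token>
```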
```diff
@@ -49,6 +43,11 @@ spec:
              key: token
        ports:
        - containerPort: 8000
        resources:
          limits:
            nvidia.com/gpu: 1
          requests:
            nvidia.com/gpu: 1
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.cache/huggingface
```
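The commit title ("rm storage class") suggests the claim backing the `llama-storage` volume (defined elsewhere in this file or in one of the other changed files) no longer pins an explicit `storageClassName`, so the cluster's default StorageClass applies. A minimal sketch of such a PVC, with the name and requested size as assumptions:

```yaml
# Sketch of a PVC the llama-storage volume could bind to. storageClassName
# is deliberately omitted so the cluster default applies; the name and
# requested size are assumptions, not taken from this diff.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-storage
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 50Gi
```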