Mirror of https://github.com/meta-llama/llama-stack.git, synced 2025-07-18 02:42:31 +00:00
make it work on gpus
This commit is contained in:
parent ee96c4891b
commit f99ca37f91
7 changed files with 30 additions and 21 deletions
@@ -26,16 +26,8 @@ spec:
         app.kubernetes.io/name: vllm-safety
         workload-type: inference
     spec:
-      affinity:
-        podAntiAffinity:
-          requiredDuringSchedulingIgnoredDuringExecution:
-          - labelSelector:
-              matchExpressions:
-              - key: workload-type
-                operator: In
-                values:
-                - inference
-            topologyKey: kubernetes.io/hostname # Ensures no two inference pods on same node
+      nodeSelector:
+        eks.amazonaws.com/nodegroup: gpu
       containers:
       - name: vllm-safety
        image: vllm/vllm-openai:latest
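In this hunk the vllm-safety pod template drops the hard pod anti-affinity rule (which kept inference pods on separate nodes) and instead pins scheduling to the EKS GPU node group via a nodeSelector. A minimal sketch of how the resulting pod spec might read after this change is below; the replica count, container port, and the nvidia.com/gpu resource request are illustrative assumptions and are not part of this diff.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-safety
spec:
  replicas: 1                              # assumption; not shown in this hunk
  selector:
    matchLabels:
      app.kubernetes.io/name: vllm-safety
  template:
    metadata:
      labels:
        app.kubernetes.io/name: vllm-safety
        workload-type: inference
    spec:
      nodeSelector:
        eks.amazonaws.com/nodegroup: gpu   # schedule only onto the GPU node group
      containers:
      - name: vllm-safety
        image: vllm/vllm-openai:latest
        ports:
        - containerPort: 8000              # assumption: vLLM OpenAI server default port
        resources:
          limits:
            nvidia.com/gpu: "1"            # assumption: one GPU via the NVIDIA device plugin

The trade-off in this change is that placement now relies on the GPU node group rather than on anti-affinity to spread inference pods, so two inference pods may land on the same GPU node if it has capacity.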