# Source: llama-stack-mirror/docs/source/distributions/k8s/llama-nim.yaml.template
# Snapshot: 2025-08-05 13:33:32 -07:00 (78 lines, 2.1 KiB)
# -------------------------------------------------
# NVIDIA NIM - Code
# Deployment running the StarCoder2-7B NIM container,
# exposing an OpenAI-compatible API on port 8000.
# -------------------------------------------------
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-nim-code
  labels:
    app: llm-nim-code
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llm-nim-code
  template:
    metadata:
      labels:
        app: llm-nim-code
        nim-type: llama-nim
      annotations:
        # Scrape GPU/inference metrics from the NIM's metrics endpoint.
        prometheus.io/scrape: 'true'
        prometheus.io/port: '8000'
        prometheus.io/path: '/v1/metrics'
    spec:
      imagePullSecrets:
        # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
        - name: ngc-docker-registry
      volumes:
        - name: model-cache
          emptyDir:
            medium: Memory   # tmpfs; omit or use "" to back by node disk
            sizeLimit: 12Gi  # sized for the model weights + tensors; adjust if needed (image is 7B — TODO confirm cache size is sufficient)
      containers:
        - name: nim
          image: nvcr.io/nim/bigcode/starcoder2-7b:1.8.1
          ports:
            - name: http-openai
              containerPort: 8000
          resources:
            limits:
              nvidia.com/gpu: 1
          env:
            - name: NIM_MODEL_NAME
              value: "nvidia/starcoder2-7b"
            - name: NGC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ngc-api
                  key: NGC_API_KEY
            - name: NVIDIA_VISIBLE_DEVICES
              value: "0"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
            - name: ENABLE_GPU_METRICS
              value: "true"
          volumeMounts:
            - name: model-cache
              mountPath: /models  # default NIM cache path
          readinessProbe:
            # Ready once the OpenAI-compatible model listing responds;
            # generous delays because model load into tmpfs is slow.
            httpGet:
              path: /v1/models
              port: 8000
            initialDelaySeconds: 100
            periodSeconds: 100
---
# ClusterIP Service fronting the NIM pods; routes port 8000 to the
# container's OpenAI-compatible HTTP endpoint.
apiVersion: v1
kind: Service
metadata:
  name: llm-nim-code
spec:
  selector:
    app: llm-nim-code
  ports:
    - name: http-openai
      port: 8000
      targetPort: 8000
  type: ClusterIP