mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-10-24 00:47:00 +00:00
78 lines
2.1 KiB
Text
78 lines
2.1 KiB
Text
# -------------------------------------------------
# NVIDIA NIM - Code
# -------------------------------------------------
# Single-replica Deployment running the StarCoder2 7B NIM container on one
# NVIDIA GPU and exposing it on container port 8000.

apiVersion: apps/v1
kind: Deployment
metadata:
  name: llm-nim-code
  labels:
    app: llm-nim-code
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llm-nim-code
  template:
    metadata:
      # The `app` label must stay identical to spec.selector.matchLabels above.
      labels:
        app: llm-nim-code
        nim-type: llama-nim
      # Scrape hints for annotation-based Prometheus discovery; the port
      # matches the container port declared below.
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '8000'
        prometheus.io/path: '/v1/metrics'
    spec:
      imagePullSecrets:
        - name: ngc-docker-registry  # docker-registry secret: nvcr.io / $oauthtoken / <NGC_DOCKER_API_KEY>
      volumes:
        - name: model-cache
          emptyDir:
            medium: Memory  # tmpfs; omit or use "" to back by node disk
            # The original note sized this for a "4 B" model, but the image
            # below is starcoder2-7b.
            # NOTE(review): confirm 12Gi is enough for the 7B weights; a
            # Memory-backed emptyDir is charged against pod memory.
            sizeLimit: 12Gi
      containers:
        - name: nim
          image: nvcr.io/nim/bigcode/starcoder2-7b:1.8.1
          ports:
            - name: http-openai
              containerPort: 8000
          resources:
            limits:
              nvidia.com/gpu: 1
          env:
            - name: NIM_MODEL_NAME
              value: "nvidia/starcoder2-7b"
            # The NGC API key is read from the `ngc-api` Secret, never inlined.
            - name: NGC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ngc-api
                  key: NGC_API_KEY
            - name: NVIDIA_VISIBLE_DEVICES
              value: "all"
            - name: NVIDIA_DRIVER_CAPABILITIES
              value: "compute,utility"
            - name: ENABLE_GPU_METRICS
              value: "true"
          volumeMounts:
            - name: model-cache
              # NOTE(review): the original comment called this the default NIM
              # cache path; confirm the container really caches under /models
              # (NIM_CACHE_PATH is not set here).
              mountPath: /models
          # The pod is marked Ready once GET /v1/models on port 8000 succeeds.
          # The first check runs after 100 s, then every 100 s.
          readinessProbe:
            httpGet:
              path: /v1/models
              port: 8000
            initialDelaySeconds: 100
            periodSeconds: 100

---
# ClusterIP Service that routes port 8000 to port 8000 of the pods labelled
# `app: llm-nim-code`.
apiVersion: v1
kind: Service
metadata:
  name: llm-nim-code
spec:
  selector:
    app: llm-nim-code
  ports:
    - name: http-openai
      port: 8000
      targetPort: 8000
  type: ClusterIP