# -------------------------------------------------
# NVIDIA NIM — Llama-3.1 Nemotron-Nano-4B-v1.1
# -------------------------------------------------
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-nano-nim
  labels:
    app: llama-nano-nim
spec:
  replicas: 1
  selector:
    matchLabels:
      app: llama-nano-nim
  template:
    metadata:
      labels:
        app: llama-nano-nim
    spec:
      imagePullSecrets:
        - name: ngc-secret              # docker-registry secret for nvcr.io ($oauthtoken / NGC API key)
      volumes:
        - name: model-cache
          emptyDir:
            medium: Memory              # tmpfs; omit or use "" to back by node disk
            sizeLimit: 12Gi             # fits the 4B model + tensors; adjust if needed
      containers:
        - name: nim
          image: nvcr.io/nim/meta/llama-3_1-nemotron-nano-4b-v1_1:1.0.0
          ports:
            - name: http-openai
              containerPort: 8000
          resources:
            limits:
              nvidia.com/gpu: 1
          env:
            - name: NIM_MODEL_NAME
              value: "nvidia/llama-3_1-nemotron-nano-4b-v1_1"
            - name: NGC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: ngc-api
                  key: NGC_API_KEY
          volumeMounts:
            - name: model-cache
              mountPath: /models        # model cache; set NIM_CACHE_PATH here if the image's default cache dir differs
          readinessProbe:
            httpGet:
              path: /v1/models
              port: http-openai
            initialDelaySeconds: 20
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /v1/health/live
              port: http-openai
            initialDelaySeconds: 60
            periodSeconds: 30
---
apiVersion: v1
kind: Service
metadata:
  name: llama-nano-nim
spec:
  selector:
    app: llama-nano-nim
  ports:
    - name: http-openai
      port: 8000
      targetPort: 8000
  type: ClusterIP
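---
# -------------------------------------------------
# Referenced secrets (sketch)
# -------------------------------------------------
# The Deployment above assumes two secrets already exist in the target
# namespace: `ngc-secret` (image pull) and `ngc-api` (API key). The Secret
# below is a minimal sketch; <NGC_API_KEY> is a placeholder, not a value
# from this manifest, and must be replaced with a real NGC API key.
# The pull secret is easiest to create imperatively, e.g.:
#   kubectl create secret docker-registry ngc-secret \
#     --docker-server=nvcr.io \
#     --docker-username='$oauthtoken' \
#     --docker-password=<NGC_API_KEY>
apiVersion: v1
kind: Secret
metadata:
  name: ngc-api
type: Opaque
stringData:
  NGC_API_KEY: "<NGC_API_KEY>"          # placeholder; substitute your NGC API key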
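# Usage note: the Service is ClusterIP, so the OpenAI-compatible API is only
# reachable in-cluster, at http://llama-nano-nim:8000/v1 from the same
# namespace. For a quick local check,
#   kubectl port-forward svc/llama-nano-nim 8000:8000
# exposes it on localhost:8000.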