# llama-stack-mirror/benchmarking/k8s-benchmark/stack-k8s.yaml.template

# Storage claim for the benchmark server. The Deployment in this file mounts
# it into the server container at /root/.llama.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-benchmark-pvc
spec:
  # 1Gi requested, with single-node read/write access.
  resources:
    requests:
      storage: 1Gi
  accessModes: [ReadWriteOnce]
---
# Llama Stack benchmark server: a single-replica Deployment that runs the
# starter distribution image under uvicorn, listening on port 8323.
#
# ${...} placeholders are template variables that must be substituted before
# this manifest is applied (presumably with envsubst -- confirm against the
# deploy script that renders this template).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-stack-benchmark-server
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: llama-stack-benchmark
      app.kubernetes.io/component: server
  template:
    metadata:
      labels:
        app.kubernetes.io/name: llama-stack-benchmark
        app.kubernetes.io/component: server
    spec:
      containers:
        - name: llama-stack-benchmark
          image: llamastack/distribution-starter:latest
          imagePullPolicy: Always # since we have specified latest instead of a version
          env:
            # Backing services, addressed by in-cluster DNS names in the
            # "default" namespace.
            - name: ENABLE_CHROMADB
              value: "true"
            - name: CHROMADB_URL
              value: http://chromadb.default.svc.cluster.local:6000
            - name: POSTGRES_HOST
              value: postgres-server.default.svc.cluster.local
            - name: POSTGRES_PORT
              value: "5432"
            # Values supplied by the template renderer.
            - name: INFERENCE_MODEL
              value: "${INFERENCE_MODEL}"
            - name: SAFETY_MODEL
              value: "${SAFETY_MODEL}"
            - name: TAVILY_SEARCH_API_KEY
              value: "${TAVILY_SEARCH_API_KEY}"
            # vLLM endpoints: one for inference, one for the safety model.
            - name: VLLM_URL
              value: http://vllm-server.default.svc.cluster.local:8000/v1
            - name: VLLM_MAX_TOKENS
              value: "3072"
            - name: VLLM_SAFETY_URL
              value: http://vllm-server-safety.default.svc.cluster.local:8001/v1
            - name: VLLM_TLS_VERIFY
              value: "false"
            - name: LLAMA_STACK_LOGGING
              value: "all=WARNING"
            # Path inside the llama-config volume mounted at /etc/config below;
            # the ConfigMap is expected to provide stack_run_config.yaml.
            - name: LLAMA_STACK_CONFIG
              value: "/etc/config/stack_run_config.yaml"
            # Also referenced by the command and the CPU request/limit below.
            - name: LLAMA_STACK_WORKERS
              value: "${LLAMA_STACK_WORKERS}"
          # Exec-form command: no shell runs, so shell-style $VAR is never
          # expanded inside the container. $(VAR) is the Kubernetes syntax for
          # referencing a container env var, so the worker count is resolved
          # from the LLAMA_STACK_WORKERS entry above.
          command:
            - "uvicorn"
            - "llama_stack.core.server.server:create_app"
            - "--host"
            - "0.0.0.0"
            - "--port"
            - "8323"
            - "--workers"
            - "$(LLAMA_STACK_WORKERS)"
            - "--factory"
          ports:
            - containerPort: 8323
          # CPU request and limit are both set to the worker count.
          resources:
            requests:
              cpu: "${LLAMA_STACK_WORKERS}"
            limits:
              cpu: "${LLAMA_STACK_WORKERS}"
          volumeMounts:
            - name: llama-storage
              mountPath: /root/.llama
            - name: llama-config
              mountPath: /etc/config
      volumes:
        # Backed by the llama-benchmark-pvc claim declared in this file.
        - name: llama-storage
          persistentVolumeClaim:
            claimName: llama-benchmark-pvc
        # The llama-stack-config ConfigMap is not defined in the visible part
        # of this file; it must exist in the target namespace.
        - name: llama-config
          configMap:
            name: llama-stack-config
---
# In-cluster (ClusterIP) Service that exposes the benchmark server on port
# 8323 and forwards to the same port on the selected pods.
apiVersion: v1
kind: Service
metadata:
  name: llama-stack-benchmark-service
spec:
  type: ClusterIP
  # Same labels as the pod template of llama-stack-benchmark-server.
  selector:
    app.kubernetes.io/component: server
    app.kubernetes.io/name: llama-stack-benchmark
  ports:
    - name: http
      targetPort: 8323
      port: 8323