From 1bc1f080375bb000a84dfd22f354c42e5dde321b Mon Sep 17 00:00:00 2001 From: Jamie Land Date: Tue, 18 Mar 2025 11:32:40 -0400 Subject: [PATCH] Adding helm chart for deploying llama-stack --- .gitignore | 1 + chart/.helmignore | 23 +++++ chart/Chart.yaml | 24 +++++ chart/README.md | 85 ++++++++++++++++ chart/files/run.yaml | 123 +++++++++++++++++++++++ chart/templates/_helpers.tpl | 62 ++++++++++++ chart/templates/config.yaml | 7 ++ chart/templates/deployment.yaml | 91 +++++++++++++++++ chart/templates/hpa.yaml | 32 ++++++ chart/templates/ingress.yaml | 43 ++++++++ chart/templates/openshift/route.yaml | 31 ++++++ chart/templates/service.yaml | 15 +++ chart/templates/serviceaccount.yaml | 13 +++ chart/values.yaml | 142 +++++++++++++++++++++++++++ 14 files changed, 692 insertions(+) create mode 100644 chart/.helmignore create mode 100644 chart/Chart.yaml create mode 100644 chart/README.md create mode 100755 chart/files/run.yaml create mode 100644 chart/templates/_helpers.tpl create mode 100644 chart/templates/config.yaml create mode 100644 chart/templates/deployment.yaml create mode 100644 chart/templates/hpa.yaml create mode 100644 chart/templates/ingress.yaml create mode 100644 chart/templates/openshift/route.yaml create mode 100644 chart/templates/service.yaml create mode 100644 chart/templates/serviceaccount.yaml create mode 100644 chart/values.yaml diff --git a/.gitignore b/.gitignore index 0ef25cdf1..76228118b 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ venv/ pytest-report.xml .coverage .python-version +**/local-* \ No newline at end of file diff --git a/chart/.helmignore b/chart/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/chart/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/chart/Chart.yaml b/chart/Chart.yaml new file mode 100644 index 000000000..e3b6b6069 --- /dev/null +++ b/chart/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: llama-stack +description: Basic chart for deploying llama-stack + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/chart/README.md b/chart/README.md new file mode 100644 index 000000000..eb319924e --- /dev/null +++ b/chart/README.md @@ -0,0 +1,85 @@ + +# Llama Stack Helm Chart + +This Helm chart is designed to install the Llama Stack, a comprehensive platform for llama-related tasks. + +The chart provides a convenient way to deploy and manage the Llama Stack on Kubernetes or OpenShift clusters. 
It offers flexibility in customizing the deployment by allowing users to modify values such as image repositories, probe configurations, resource limits, and more. + +Optionally, the chart also supports the installation of the llama-stack-playground, which provides a web-based interface for interacting with the Llama Stack. + +## Quick Start + +Create a `local-values.yaml` file with the following: + +> **Note** +> Chart currently only supports the `vllm` framework directly. But other distributions can be used by modifying the `env` directly. + +```yaml +vllm: + url: "https://<your-vllm-host>:443/v1" + inferenceModel: "meta-llama/Llama-3.1-8B-Instruct" + apiKey: xxxxxxxxxxxxxxxxxxxxxxxxxxxxx +``` + +Log in to Kubernetes through the CLI and run: + +```sh +helm upgrade -i llama-stack . -f local-values.yaml +``` + +## Values + +| Key | Type | Default | Description | |-----|------|---------|-------------| +| autoscaling.enabled | bool | `false` | | +| autoscaling.maxReplicas | int | `100` | | +| autoscaling.minReplicas | int | `1` | | +| autoscaling.targetCPUUtilizationPercentage | int | `80` | | +| distribution | string | `"distribution-remote-vllm"` | | +| image.pullPolicy | string | `"Always"` | | +| image.repository | string | `"docker.io/llamastack/{{ $.Values.distribution }}"` | | +| image.tag | string | `"0.1.6"` | | +| ingress.annotations | object | `{}` | | +| ingress.className | string | `""` | | +| ingress.enabled | bool | `true` | | +| ingress.hosts[0].host | string | `"chart-example.local"` | | +| ingress.hosts[0].paths[0].path | string | `"/"` | | +| ingress.hosts[0].paths[0].pathType | string | `"ImplementationSpecific"` | | +| ingress.tls | list | `[]` | | +| livenessProbe.httpGet.path | string | `"/v1/health"` | | +| livenessProbe.httpGet.port | int | `5001` | | +| podAnnotations | object | `{}` | | +| podLabels | object | `{}` | | +| podSecurityContext | object | `{}` | | +| readinessProbe.httpGet.path | string | `"/v1/health"` | | +| readinessProbe.httpGet.port | int | 
`5001` | | +| replicaCount | int | `1` | | +| resources.limits.cpu | string | `"100m"` | | +| resources.limits.memory | string | `"500Mi"` | | +| resources.requests.cpu | string | `"100m"` | | +| resources.requests.memory | string | `"500Mi"` | | +| route | object | `{"annotations":{},"enabled":false,"host":"","path":"","tls":{"enabled":true,"insecureEdgeTerminationPolicy":"Redirect","termination":"edge"}}` | Enable creation of the OpenShift Route object (This should be used instead of ingress on OpenShift) | +| route.annotations | object | `{}` | Additional custom annotations for the route | +| route.host | string | Set by OpenShift | The hostname for the route | +| route.path | string | `""` | The path for the OpenShift route | +| route.tls.enabled | bool | `true` | Enable secure route settings | +| route.tls.insecureEdgeTerminationPolicy | string | `"Redirect"` | Insecure route termination policy | +| route.tls.termination | string | `"edge"` | Secure route termination policy | +| runConfig.enabled | bool | `false` | | +| service.port | int | `5001` | | +| service.type | string | `"ClusterIP"` | | +| serviceAccount.annotations | object | `{}` | | +| serviceAccount.automount | bool | `true` | | +| serviceAccount.create | bool | `false` | | +| serviceAccount.name | string | `""` | | +| startupProbe.failureThreshold | int | `30` | | +| startupProbe.httpGet.path | string | `"/v1/health"` | | +| startupProbe.httpGet.port | int | `5001` | | +| startupProbe.initialDelaySeconds | int | `40` | | +| startupProbe.periodSeconds | int | `10` | | +| telemetry.enabled | bool | `false` | | +| telemetry.serviceName | string | `"otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318"` | | +| telemetry.sinks | string | `"console,sqlite,otel"` | | +| vllm.inferenceModel | string | `"llama2-7b-chat"` | | +| vllm.url | string | `"http://vllm-server"` | | +| yamlConfig | string | `"/config/run.yaml"` | | diff --git a/chart/files/run.yaml b/chart/files/run.yaml new file 
mode 100755 index 000000000..14a9aefb2 --- /dev/null +++ b/chart/files/run.yaml @@ -0,0 +1,123 @@ +version: '2' +image_name: vllm-gpu +apis: +- agents +- datasetio +- eval +- inference +- safety +- scoring +- telemetry +- tool_runtime +- vector_io +providers: + inference: + - provider_id: vllm + provider_type: remote::vllm + config: + url: ${env.VLLM_URL:http://localhost:8000} + api_token: ${env.VLLM_API_TOKEN} + model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct} + tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1} + max_tokens: ${env.MAX_TOKENS:4096} + enforce_eager: ${env.ENFORCE_EAGER:False} + gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7} + - provider_id: sentence-transformers + provider_type: inline::sentence-transformers + config: {} + vector_io: + - provider_id: faiss + provider_type: inline::faiss + config: + kvstore: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db + safety: + - provider_id: llama-guard + provider_type: inline::llama-guard + config: {} + agents: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + persistence_store: + type: sqlite + namespace: null + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console,sqlite} + sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/vllm-gpu/trace_store.db} + eval: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: {} + datasetio: + - provider_id: huggingface + provider_type: remote::huggingface + config: {} + - provider_id: localfs + provider_type: inline::localfs + config: {} + scoring: + - provider_id: basic + provider_type: inline::basic + config: {} + - provider_id: llm-as-judge + provider_type: inline::llm-as-judge + config: {} + - provider_id: 
braintrust + provider_type: inline::braintrust + config: + openai_api_key: ${env.OPENAI_API_KEY:} + tool_runtime: + - provider_id: brave-search + provider_type: remote::brave-search + config: + api_key: ${env.BRAVE_SEARCH_API_KEY:} + max_results: 3 + - provider_id: tavily-search + provider_type: remote::tavily-search + config: + api_key: ${env.TAVILY_SEARCH_API_KEY:} + max_results: 3 + - provider_id: code-interpreter + provider_type: inline::code-interpreter + config: {} + - provider_id: rag-runtime + provider_type: inline::rag-runtime + config: {} + - provider_id: model-context-protocol + provider_type: remote::model-context-protocol + config: {} +metadata_store: + type: sqlite + db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db +models: +- metadata: {} + model_id: ${env.INFERENCE_MODEL} + provider_id: vllm + model_type: llm +- metadata: + embedding_dimension: 384 + model_id: all-MiniLM-L6-v2 + provider_id: sentence-transformers + model_type: embedding +shields: [] +vector_dbs: [] +datasets: [] +scoring_fns: [] +benchmarks: [] +tool_groups: +- toolgroup_id: builtin::websearch + provider_id: brave-search +- toolgroup_id: builtin::rag + provider_id: rag-runtime +- toolgroup_id: builtin::code_interpreter + provider_id: code-interpreter +server: + port: 8321 diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl new file mode 100644 index 000000000..2961262d7 --- /dev/null +++ b/chart/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "llama-stack.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "llama-stack.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "llama-stack.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "llama-stack.labels" -}} +helm.sh/chart: {{ include "llama-stack.chart" . }} +{{ include "llama-stack.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "llama-stack.selectorLabels" -}} +app.kubernetes.io/name: {{ include "llama-stack.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "llama-stack.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "llama-stack.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/chart/templates/config.yaml b/chart/templates/config.yaml new file mode 100644 index 000000000..62e18272c --- /dev/null +++ b/chart/templates/config.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "llama-stack.fullname" . 
}}-run-config +data: + run.yaml: |- + {{- .Files.Get "files/run.yaml" | nindent 4 }} diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml new file mode 100644 index 000000000..7f2a0ea2f --- /dev/null +++ b/chart/templates/deployment.yaml @@ -0,0 +1,91 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "llama-stack.fullname" . }} + labels: + {{- include "llama-stack.labels" . | nindent 4 }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "llama-stack.selectorLabels" . | nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "llama-stack.labels" . | nindent 8 }} + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "llama-stack.serviceAccountName" . 
}} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ (tpl .Values.image.repository $) }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: http + containerPort: {{ .Values.service.port }} + protocol: TCP + args: + - "--yaml-config" + - "/config/run.yaml" + env: + {{- with .Values.vllm }} + - name: VLLM_URL + value: {{ .url | quote }} + - name: VLLM_API_TOKEN + value: {{ .apiKey | default "" | quote}} + - name: INFERENCE_MODEL + value: {{ .inferenceModel | quote }} + {{- end }} + - name: LLAMA_STACK_PORT + value: {{ .Values.service.port | quote }} + {{- if .Values.telemetry.enabled }} + - name: TELEMETRY_SINKS + value: {{ .Values.telemetry.sinks | quote }} + - name: OTEL_SERVICE_NAME + value: {{ .Values.telemetry.serviceName | quote }} + {{- end }} + {{- with .Values.env }} + {{- toYaml . | nindent 12 }} + {{- end }} + livenessProbe: + {{- tpl (toYaml .Values.livenessProbe) $ | nindent 12 }} + readinessProbe: + {{- tpl (toYaml .Values.readinessProbe) $ | nindent 12 }} + startupProbe: + {{- tpl (toYaml .Values.startupProbe) $ | nindent 12 }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + - name: config-volume + mountPath: /config + volumes: + - name: config-volume + configMap: + name: {{ include "llama-stack.fullname" . }}-run-config + defaultMode: 0755 + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} diff --git a/chart/templates/hpa.yaml b/chart/templates/hpa.yaml new file mode 100644 index 000000000..75ec6b1fc --- /dev/null +++ b/chart/templates/hpa.yaml @@ -0,0 +1,32 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "llama-stack.fullname" . }} + labels: + {{- include "llama-stack.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "llama-stack.fullname" . }} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/chart/templates/ingress.yaml b/chart/templates/ingress.yaml new file mode 100644 index 000000000..251f990f3 --- /dev/null +++ b/chart/templates/ingress.yaml @@ -0,0 +1,43 @@ +{{- if .Values.ingress.enabled -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "llama-stack.fullname" . }} + labels: + {{- include "llama-stack.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- with .Values.ingress.className }} + ingressClassName: {{ . }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- with .pathType }} + pathType: {{ . }} + {{- end }} + backend: + service: + name: {{ include "llama-stack.fullname" $ }} + port: + number: {{ $.Values.service.port }} + {{- end }} + {{- end }} +{{- end }} diff --git a/chart/templates/openshift/route.yaml b/chart/templates/openshift/route.yaml new file mode 100644 index 000000000..d64c89265 --- /dev/null +++ b/chart/templates/openshift/route.yaml @@ -0,0 +1,31 @@ +{{- if .Values.route.enabled -}} +kind: Route +apiVersion: route.openshift.io/v1 +metadata: + name: {{ include "llama-stack.fullname" . }} + labels: + {{- include "llama-stack.labels" . | nindent 4 }} + {{- with .Values.route.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.route.host }} + host: {{ .Values.route.host }} + {{- end }} + {{- if .Values.route.path }} + path: {{ .Values.route.path }} + {{- end }} + to: + kind: Service + name: {{ include "llama-stack.fullname" . }} + weight: 100 + port: + targetPort: llama-stack + {{- if .Values.route.tls.enabled }} + tls: + termination: {{ .Values.route.tls.termination }} + insecureEdgeTerminationPolicy: {{ .Values.route.tls.insecureEdgeTerminationPolicy }} + {{- end }} + wildcardPolicy: None +{{- end }} \ No newline at end of file diff --git a/chart/templates/service.yaml b/chart/templates/service.yaml new file mode 100644 index 000000000..b0d968fc9 --- /dev/null +++ b/chart/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "llama-stack.fullname" . }} + labels: + {{- include "llama-stack.labels" . 
| nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: llama-stack + selector: + {{- include "llama-stack.selectorLabels" . | nindent 4 }} diff --git a/chart/templates/serviceaccount.yaml b/chart/templates/serviceaccount.yaml new file mode 100644 index 000000000..655413c42 --- /dev/null +++ b/chart/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "llama-stack.serviceAccountName" . }} + labels: + {{- include "llama-stack.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +automountServiceAccountToken: {{ .Values.serviceAccount.automount }} +{{- end }} diff --git a/chart/values.yaml b/chart/values.yaml new file mode 100644 index 000000000..65e0ce61c --- /dev/null +++ b/chart/values.yaml @@ -0,0 +1,142 @@ +# yamlConfig: "/config/run.yaml" + +# TODO: Currently we are only working for vLLM this should be expanded in the future +vllm: + url: "http://vllm-server" + inferenceModel: "llama2-7b-chat" + # This is the API key for the VLLM server. 
It can be set in two ways through a secret: + # TODO: Implement this + # secret: + # name: vllm-secret + # key: vll + # or directly with an api key (should be avoided in production) + # apiKey: "xxxxxxxxxxxx" + +# https://llama-stack.readthedocs.io/en/latest/distributions/selection.html +distribution: distribution-remote-vllm + +runConfig: + enabled: false + # customYaml: + # Your custom run.yaml configuration file can be pasted here + # If not set, the default run.yaml file in the `files/run.yaml` will be used + +telemetry: + enabled: false + serviceName: "otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318" + sinks: "console,sqlite,otel" + +# Use to allow for other env variables to be passed to the container +# env: +# MY_CUSTOM_ENV_VAR: "my-custom-env-var-value" + +replicaCount: 1 + +# This sets the container image more information can be found here: https://kubernetes.io/docs/concepts/containers/images/ +image: + repository: docker.io/llamastack/{{ $.Values.distribution }} + tag: 0.1.6 + # This sets the pull policy for images. + pullPolicy: Always + + +# This section builds out the service account more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/ +serviceAccount: + # Specifies whether a service account should be created + create: false + # Automatically mount a ServiceAccount's API credentials? + automount: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +# This is for setting Kubernetes Annotations to a Pod. +# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/ +podAnnotations: {} +# This is for setting Kubernetes Labels to a Pod. 
+# For more information checkout: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ +podLabels: {} + +podSecurityContext: {} + # fsGroup: 2000 + +# This is for setting up a service more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/ +service: + # This sets the service type more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types + type: ClusterIP + # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports + port: 5001 + + +# This block is for setting up the ingress for more information can be found here: https://kubernetes.io/docs/concepts/services-networking/ingress/ +ingress: + enabled: true + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: chart-example.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - chart-example.local + + +# -- Enable creation of the OpenShift Route object (This should be used instead of ingress on OpenShift) +route: + enabled: false + # Allow OCP to determine the host if left blank + # -- The hostname for the route + # @default -- Set by OpenShift + host: "" + # -- The path for the OpenShift route + path: "" + tls: + # -- Enable secure route settings + enabled: true + # -- Secure route termination policy + termination: edge + # -- Insecure route termination policy + insecureEdgeTerminationPolicy: Redirect + # -- Additional custom annotations for the route + annotations: {} + + +resources: + limits: + cpu: 100m + memory: 500Mi + requests: + cpu: 100m + memory: 500Mi + +# This is to setup the liveness and readiness probes more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ 
+livenessProbe: + httpGet: + path: /v1/health + port: 5001 +readinessProbe: + httpGet: + path: /v1/health + port: 5001 +startupProbe: + httpGet: + path: /v1/health + port: 5001 + initialDelaySeconds: 40 + periodSeconds: 10 + failureThreshold: 30 + +# This section is for setting up autoscaling more information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/ +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80