Addressing comments

2026-01-02 03:04:31 +00:00 · 2025-03-18 19:42:23 -04:00 · 2025-03-18 19:42:23 -04:00 · e7fbe76151
commit e7fbe76151
parent fc627f5c57
14 changed files with 11 additions and 4 deletions
--- a/charts/llama-stack/.helmignore
+++ b/charts/llama-stack/.helmignore
@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+# macOS Finder metadata
+.DS_Store
+# Common VCS directories and their ignore files
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup and temporary files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# IDE and editor project files
+.project
+.idea/
+*.tmproj
+.vscode/
--- a/charts/llama-stack/Chart.yaml
+++ b/charts/llama-stack/Chart.yaml
@ -0,0 +1,25 @@
+apiVersion: v2
+name: llama-stack
+# NOTE(review): this URL is the generic Helm logo; swap in a project icon if one exists.
+icon: https://helm.sh/img/helm.svg
+description: Basic chart for deploying llama-stack
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: 0.1.0
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+# The chart README documents the same value ("0.1.6") as the default `image.tag`;
+# keep the two aligned when bumping.
+appVersion: "0.1.6"
--- a/charts/llama-stack/README.md
+++ b/charts/llama-stack/README.md
@ -0,0 +1,103 @@
+# Llama Stack Helm Chart
+
+This Helm chart is designed to install the Llama Stack, a comprehensive platform for llama-related tasks.
+
+The chart provides a convenient way to deploy and manage the Llama Stack on Kubernetes or OpenShift clusters. It offers flexibility in customizing the deployment by allowing users to modify values such as image repositories, probe configurations, resource limits, and more.
+
+Optionally, the chart also supports the installation of the llama-stack-playground, which provides a web-based interface for interacting with the Llama Stack.
+
+## Quick Start
+
+Create a `local-values.yaml` file with the following:
+
+> **Note**
+> The chart currently supports only the `vllm` framework directly. Other distributions can be managed by adding entries to `env` in the values file.
+
+```yaml
+
+distribution: distribution-remote-vllm
+
+vllm:
+  url: "https://<MY_VLLM_INSTANCE>:443/v1"
+  inferenceModel: "meta-llama/Llama-3.1-8B-Instruct"
+  apiKey: xxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+```
+
+Login to Kubernetes through the CLI and run:
+
+```sh
+helm upgrade -i llama-stack . -f local-values.yaml
+```
+
+> [!TIP]
+> The chart can be installed on [minikube](https://minikube.sigs.k8s.io/docs/start/?arch=%2Flinux%2Fx86-64%2Fstable%2Fbinary+download) for local validation.
+
+## Custom Configuration
+
+By default llama-stack will use the run.yaml config that comes with the specified distribution. For more granular control the `customRunConfig` can be set to true, in which case the helm chart will use the values inside of the `files/run.yaml` instead.
+
+## Values
+
+### Llama Stack Specific
+
+| Key                     | Type     | Default                                                                    | Description                                                                                                                           |
+| :---------------------- | :------- | :------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------ |
+| `customRunConfig`       | `bool`   | `false`                                                                    | Indicates whether a custom run configuration is being used.                                                                           |
+| `distribution`          | `string` | `"distribution-remote-vllm"`                                               | Specifies the distribution or type of deployment being used (in this case, related to a remote vLLM distribution).                    |
+| `telemetry.enabled`     | `bool`   | `false`                                                                    | Enables or disables telemetry collection.                                                                                             |
+| `telemetry.serviceName` | `string` | `"otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318"` | The service name and address of the telemetry collector.                                                                              |
+| `telemetry.sinks`       | `string` | `"console,sqlite,otel"`                                                    | Specifies the destinations or sinks where telemetry data will be sent.                                                                |
+| `vllm.inferenceModel`   | `string` | `"llama2-7b-chat"`                                                         | The specific inference model to be used by vLLM (a high-throughput and memory-efficient inference service for large language models). |
+| `vllm.url`              | `string` | `"http://vllm-server"`                                                     | The URL of the vLLM service.                                                                                                          |
+| `env`                   | `object` | N/A                                                                        | A set of key/value pairs that can be set in the pod                                                                                   |
+
+### General
+
+| Key                                        | Type   | Default                                                                                                                            | Description                                                                                                                                                                                                                                                           |
+| :----------------------------------------- | :----- | :----------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `autoscaling.enabled`                      | `bool` | `false`                                                                                                                              | Enables or disables horizontal pod autoscaling, which automatically adjusts the number of running instances based on CPU utilization.                                                                                                                             |
+| `autoscaling.maxReplicas`                  | `int`  | `100`                                                                                                                                | The maximum number of pod replicas that the autoscaler can scale up to.                                                                                                                                                                                               |
+| `autoscaling.minReplicas`                  | `int`  | `1`                                                                                                                                  | The minimum number of pod replicas that will always be running.                                                                                                                                                                                                      |
+| `autoscaling.targetCPUUtilizationPercentage` | `int`  | `80`                                                                                                                                 | The target average CPU utilization across all running pods that the autoscaler will aim to maintain.                                                                                                                                                                    |
+| `image.pullPolicy`                         | `string` | `"Always"`                                                                                                                           | Defines when to pull the Docker image for the container (e.g., always pull, pull if not present, etc.).                                                                                                                                                           |
+| `image.repository`                         | `string` | `"docker.io/llamastack/{{ $.Values.distribution }}"`                                                                                 | The Docker image repository where the container image is located. It likely uses the `distribution` value to construct the full image path.                                                                                                                            |
+| `image.tag`                                | `string` | `"0.1.6"`                                                                                                                            | The specific version tag of the Docker image to use.                                                                                                                                                                                                                 |
+| `ingress.annotations`                      | `object` | `{}`                                                                                                                                 | Kubernetes Ingress annotations, which can be used to configure load balancers and other external access settings.                                                                                                                                                    |
+| `ingress.className`                        | `string` | `""`                                                                                                                                 | The name of the Ingress controller to use for this Ingress resource.                                                                                                                                                                                                 |
+| `ingress.enabled`                          | `bool` | `true`                                                                                                                               | Enables or disables the creation of a Kubernetes Ingress resource, which allows external access to the application.                                                                                                                                                  |
+| `ingress.hosts[0].host`                    | `string` | `"chart-example.local"`                                                                                                            | The hostname that the Ingress will route traffic to. This is often a placeholder or example.                                                                                                                                                                             |
+| `ingress.hosts[0].paths[0].path`          | `string` | `"/"`                                                                                                                                  | The path on the specified host that the Ingress will route traffic to (in this case, the root path).                                                                                                                                                                     |
+| `ingress.hosts[0].paths[0].pathType`      | `string` | `"ImplementationSpecific"`                                                                                                         | The type of path matching used by the Ingress controller.                                                                                                                                                                                                            |
+| `ingress.tls`                              | `list`   | `[]`                                                                                                                                 | Configuration for Transport Layer Security (TLS) termination at the Ingress, allowing for HTTPS.                                                                                                                                                                     |
+| `livenessProbe.httpGet.path`               | `string` | `"/v1/health"`                                                                                                                      | The HTTP endpoint path that the liveness probe will check to determine if the container is running and healthy.                                                                                                                                                           |
+| `livenessProbe.httpGet.port`               | `int`  | `5001`                                                                                                                               | The port that the liveness probe will connect to for the HTTP health check.                                                                                                                                                                                           |
+| `podAnnotations`                           | `object` | `{}`                                                                                                                                 | Kubernetes Pod annotations, which can be used to attach arbitrary non-identifying metadata to the Pod.                                                                                                                                                                 |
+| `podLabels`                                | `object` | `{}`                                                                                                                                 | Kubernetes Pod labels, which are key/value pairs that are attached to Pods and can be used for organizing and selecting groups of Pods.                                                                                                                                    |
+| `podSecurityContext`                       | `object` | `{}`                                                                                                                                 | Defines the security context for the Pod, such as user and group IDs, security capabilities, etc.                                                                                                                                                                      |
+| `readinessProbe.httpGet.path`              | `string` | `"/v1/health"`                                                                                                                      | The HTTP endpoint path that the readiness probe will check to determine if the container is ready to serve traffic.                                                                                                                                                           |
+| `readinessProbe.httpGet.port`              | `int`  | `5001`                                                                                                                               | The port that the readiness probe will connect to for the HTTP readiness check.                                                                                                                                                                                          |
+| `replicaCount`                             | `int`  | `1`                                                                                                                                  | The desired number of pod replicas to run.                                                                                                                                                                                                                         |
+| `resources.limits.cpu`                     | `string` | `"100m"`                                                                                                                             | The maximum amount of CPU resources that a container can use (in millicores).                                                                                                                                                                                            |
+| `resources.limits.memory`                  | `string` | `"500Mi"`                                                                                                                             | The maximum amount of memory that a container can use (in megabytes).                                                                                                                                                                                                  |
+| `resources.requests.cpu`                   | `string` | `"100m"`                                                                                                                             | The amount of CPU resources that Kubernetes will guarantee to be available for the container.                                                                                                                                                                              |
+| `resources.requests.memory`                | `string` | `"500Mi"`                                                                                                                             | The amount of memory that Kubernetes will guarantee to be available for the container (in mebibytes).                                                                                                                                                                     |
+| `route`                                    | `object` | `{"annotations":{},"enabled":false,"host":"","path":"","tls":{"enabled":true,"insecureEdgeTerminationPolicy":"Redirect","termination":"edge"}}` | Configuration for an OpenShift Route object, which is used for exposing services externally on OpenShift.                                                                                                                                                           |
+| `route.annotations`                        | `object` | `{}`                                                                                                                                 | Additional custom annotations for the OpenShift Route object.                                                                                                                                                                                                        |
+| `route.host`                               | `string` | `Set by OpenShift`                                                                                                                   | The hostname for the OpenShift Route. This is typically managed by OpenShift.                                                                                                                                                                                           |
+| `route.path`                               | `string` | `""`                                                                                                                                 | The path for the OpenShift Route.                                                                                                                                                                                                                                    |
+| `route.tls.enabled`                        | `bool` | `true`                                                                                                                               | Enables or disables TLS for the OpenShift Route, providing secure communication.                                                                                                                                                                                          |
+| `route.tls.insecureEdgeTerminationPolicy`    | `string` | `"Redirect"`                                                                                                                         | The policy for handling insecure (HTTP) requests when TLS termination is at the edge (Route).                                                                                                                                                                         |
+| `route.tls.termination`                    | `string` | `"edge"`                                                                                                                             | Specifies that TLS termination occurs at the OpenShift Route edge.                                                                                                                                                                                                      |
+| `service.port`                             | `int`  | `5001`                                                                                                                               | The port on which the Kubernetes Service will be exposed internally within the cluster.                                                                                                                                                                                  |
+| `service.type`                             | `string` | `"ClusterIP"`                                                                                                                        | The type of Kubernetes Service. `ClusterIP` makes the service only reachable from within the cluster.                                                                                                                                                                 |
+| `serviceAccount.annotations`               | `object` | `{}`                                                                                                                                 | Annotations for the Kubernetes ServiceAccount.                                                                                                                                                                                                                       |
+| `serviceAccount.automount`                 | `bool` | `true`                                                                                                                               | Indicates whether the ServiceAccount token should be automatically mounted into the Pods.                                                                                                                                                                            |
+| `serviceAccount.create`                    | `bool` | `false`                                                                                                                              | Determines whether a new Kubernetes ServiceAccount should be created.                                                                                                                                                                                                 |
+| `serviceAccount.name`                      | `string` | `""`                                                                                                                                 | The name of an existing Kubernetes ServiceAccount to use. If `create` is true and this is empty, a default name will be generated.                                                                                                                                     |
+| `startupProbe.failureThreshold`            | `int`  | `30`                                                                                                                                 | The number of consecutive failures of the startup probe before Kubernetes considers the container failed to start.                                                                                                                                                  |
+| `startupProbe.httpGet.path`                | `string` | `"/v1/health"`                                                                                                                      | The HTTP endpoint path for the startup probe, used to determine if the application has started successfully.                                                                                                                                                           |
+| `startupProbe.httpGet.port`                | `int`  | `5001`                                                                                                                               | The port for the HTTP startup probe.                                                                                                                                                                                                                                 |
+| `startupProbe.initialDelaySeconds`         | `int`  | `40`                                                                                                                                 | The number of seconds to wait after the container has started before the startup probe is first initiated.                                                                                                                                                            |
+| `startupProbe.periodSeconds`               | `int`  | `10`                                                                                                                                 | The interval (in seconds) at which the startup probe will be executed.                                                                                                                                                                                               |
+| `volumeMounts`                             | `list`   | `[]`                                                                                                                                 | A list of volume mounts that define how volumes should be mounted into the container's filesystem.                                                                                                                                                                   |
+| `volumes`                                  | `list`   | `[]`                                                                                                                                 | A list of volume definitions that provide storage for the Pod.                                                                                                                                                                                                          |
--- a/charts/llama-stack/files/run.yaml
+++ b/charts/llama-stack/files/run.yaml
@ -0,0 +1,123 @@
+# Custom llama-stack run configuration shipped with the chart.
+# Per the chart README, it is used instead of the distribution's built-in
+# run.yaml only when `customRunConfig` is set to true.
+# `${env.NAME:fallback}` placeholders are presumably resolved from the
+# container environment at startup, using the text after ':' when NAME is
+# unset — TODO confirm against the llama-stack version in use.
+version: '2'
+# NOTE(review): named vllm-gpu although the inference provider in this file is
+# remote::vllm — confirm the name is intentional.
+image_name: vllm-gpu
+# APIs enabled for this stack; each one has a matching section under `providers`.
+apis:
+- agents
+- datasetio
+- eval
+- inference
+- safety
+- scoring
+- telemetry
+- tool_runtime
+- vector_io
+# Provider implementations backing each API listed under `apis`.
+providers:
+  inference:
+  # Remote vLLM server used for LLM inference.
+  - provider_id: vllm
+    provider_type: remote::vllm
+    config:
+      url: ${env.VLLM_URL:http://localhost:8000}
+      # Fallback added so that an unset VLLM_API_TOKEN (e.g. an unauthenticated
+      # vLLM server) does not leave this placeholder unresolved; the other
+      # optional credentials in this file also carry a fallback.
+      api_token: ${env.VLLM_API_TOKEN:fake}
+      model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
+      # NOTE(review): the next four keys look like inline vLLM engine settings;
+      # confirm that remote::vllm accepts (or ignores) them.
+      tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1}
+      max_tokens: ${env.MAX_TOKENS:4096}
+      enforce_eager: ${env.ENFORCE_EAGER:False}
+      gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
+  # In-process provider that serves the embedding model registered under `models`.
+  - provider_id: sentence-transformers
+    provider_type: inline::sentence-transformers
+    config: {}
+  vector_io:
+  - provider_id: faiss
+    provider_type: inline::faiss
+    config:
+      kvstore:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db
+  safety:
+  - provider_id: llama-guard
+    provider_type: inline::llama-guard
+    config: {}
+  agents:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      persistence_store:
+        type: sqlite
+        namespace: null
+        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db
+  telemetry:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config:
+      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
+      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
+      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/vllm-gpu/trace_store.db}
+  eval:
+  - provider_id: meta-reference
+    provider_type: inline::meta-reference
+    config: {}
+  datasetio:
+  - provider_id: huggingface
+    provider_type: remote::huggingface
+    config: {}
+  - provider_id: localfs
+    provider_type: inline::localfs
+    config: {}
+  scoring:
+  - provider_id: basic
+    provider_type: inline::basic
+    config: {}
+  - provider_id: llm-as-judge
+    provider_type: inline::llm-as-judge
+    config: {}
+  - provider_id: braintrust
+    provider_type: inline::braintrust
+    config:
+      # Empty fallback: the key is optional.
+      openai_api_key: ${env.OPENAI_API_KEY:}
+  tool_runtime:
+  # Both web-search providers are configured; the builtin::websearch tool group
+  # in this file is bound to brave-search.
+  - provider_id: brave-search
+    provider_type: remote::brave-search
+    config:
+      api_key: ${env.BRAVE_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: tavily-search
+    provider_type: remote::tavily-search
+    config:
+      api_key: ${env.TAVILY_SEARCH_API_KEY:}
+      max_results: 3
+  - provider_id: code-interpreter
+    provider_type: inline::code-interpreter
+    config: {}
+  - provider_id: rag-runtime
+    provider_type: inline::rag-runtime
+    config: {}
+  - provider_id: model-context-protocol
+    provider_type: remote::model-context-protocol
+    config: {}
+# SQLite-backed registry database, stored next to the other provider databases.
+metadata_store:
+  type: sqlite
+  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db
+# Models registered with the stack.
+models:
+# LLM served by the vllm provider. INFERENCE_MODEL has no fallback here,
+# unlike the provider config above, so it is presumably required — TODO confirm.
+- metadata: {}
+  model_id: ${env.INFERENCE_MODEL}
+  provider_id: vllm
+  model_type: llm
+# Embedding model served by the sentence-transformers provider.
+- metadata:
+    embedding_dimension: 384
+  model_id: all-MiniLM-L6-v2
+  provider_id: sentence-transformers
+  model_type: embedding
+# No resources of these kinds are pre-registered.
+shields: []
+vector_dbs: []
+datasets: []
+scoring_fns: []
+benchmarks: []
+# Built-in tool groups and the tool_runtime provider backing each one.
+tool_groups:
+- toolgroup_id: builtin::websearch
+  provider_id: brave-search
+- toolgroup_id: builtin::rag
+  provider_id: rag-runtime
+- toolgroup_id: builtin::code_interpreter
+  provider_id: code-interpreter
+server:
+  # NOTE(review): the chart README documents 5001 for service.port and all
+  # probe ports — confirm the deployment overrides this value or align the two.
+  port: 8321
--- a/charts/llama-stack/templates/_helpers.tpl
+++ b/charts/llama-stack/templates/_helpers.tpl
@ -0,0 +1,62 @@
+{{/*
+Short name of the chart: `nameOverride` when it is set, otherwise the chart's
+own name. The result is cut to 63 characters and a trailing "-" is removed.
+*/}}
+{{- define "llama-stack.name" -}}
+{{- $base := .Values.nameOverride | default .Chart.Name -}}
+{{- $base | trunc 63 | trimSuffix "-" -}}
+{{- end }}
+
+{{/*
+Fully qualified app name used to name the chart's resources.
+Resolution order:
+  1. `fullnameOverride`, when set;
+  2. the release name alone, when it already contains the chart name;
+  3. "<release name>-<chart name>" otherwise.
+Whatever is chosen is truncated to 63 characters (some Kubernetes name fields
+are limited to this by the DNS naming spec) and a trailing "-" is removed.
+*/}}
+{{- define "llama-stack.fullname" -}}
+{{- $result := "" -}}
+{{- if .Values.fullnameOverride -}}
+{{- $result = .Values.fullnameOverride -}}
+{{- else -}}
+{{- $chartName := .Values.nameOverride | default .Chart.Name -}}
+{{- if contains $chartName .Release.Name -}}
+{{- $result = .Release.Name -}}
+{{- else -}}
+{{- $result = printf "%s-%s" .Release.Name $chartName -}}
+{{- end -}}
+{{- end -}}
+{{- $result | trunc 63 | trimSuffix "-" -}}
+{{- end }}
+
+{{/*
+"<chart name>-<chart version>" string for the `helm.sh/chart` label.
+"+" is replaced with "_", and the result is truncated to 63 characters with
+any trailing "-" removed.
+*/}}
+{{- define "llama-stack.chart" -}}
+{{- $chartRef := printf "%s-%s" .Chart.Name .Chart.Version -}}
+{{- $chartRef | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end }}
+
+{{/*
+Common labels applied to the chart's resources: the chart reference, the
+selector labels, the app version (only when the chart declares one) and the
+managing service.
+*/}}
+{{- define "llama-stack.labels" -}}
+helm.sh/chart: {{ include "llama-stack.chart" . }}
+{{ include "llama-stack.selectorLabels" . }}
+{{- with .Chart.AppVersion }}
+app.kubernetes.io/version: {{ quote . }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels: the chart name / release name pair.
+They are used as the Deployment's `matchLabels` and as the Service selector,
+and are also embedded in the common labels, so every pod created by the
+Deployment carries them.
+*/}}
+{{- define "llama-stack.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "llama-stack.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Name of the service account the pod runs as.
+`serviceAccount.name` wins when it is set. Otherwise the fallback is the
+chart's full name when the chart creates the account, or "default" when it
+does not.
+*/}}
+{{- define "llama-stack.serviceAccountName" -}}
+{{- $fallback := "default" -}}
+{{- if .Values.serviceAccount.create -}}
+{{- $fallback = include "llama-stack.fullname" . -}}
+{{- end -}}
+{{- .Values.serviceAccount.name | default $fallback -}}
+{{- end }}
--- a/charts/llama-stack/templates/config.yaml
+++ b/charts/llama-stack/templates/config.yaml
@ -0,0 +1,9 @@
+{{- /*
+ConfigMap holding the chart's bundled files/run.yaml.
+Rendered only when `customRunConfig` is true. The Deployment mounts it at
+/config and starts the server with `--yaml-config /config/run.yaml`.
+It carries the same common labels as the chart's other resources.
+*/ -}}
+{{- if .Values.customRunConfig }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "llama-stack.fullname" . }}-run-config
+  labels:
+    {{- include "llama-stack.labels" . | nindent 4 }}
+data:
+  run.yaml: |-
+    {{- .Files.Get "files/run.yaml" | nindent 4 }}
+{{- end }}
--- a/charts/llama-stack/templates/deployment.yaml
+++ b/charts/llama-stack/templates/deployment.yaml
@ -0,0 +1,105 @@
+{{- /*
+Deployment running the llama-stack server container.
+
+Values read here:
+  replicaCount / autoscaling.enabled  replica count (left to the HPA when autoscaling is on)
+  customRunConfig                     mount the run-config ConfigMap and pass --yaml-config
+  vllm.*                              exported as VLLM_URL / VLLM_API_TOKEN / INFERENCE_MODEL
+  service.port                        containerPort "http" and LLAMA_STACK_PORT
+  telemetry.*                         TELEMETRY_SINKS / OTEL_SERVICE_NAME when enabled
+  env                                 extra EnvVar list entries appended verbatim
+  volumes / volumeMounts              extra volumes and mounts appended verbatim
+*/ -}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "llama-stack.fullname" . }}
+  labels:
+    {{- include "llama-stack.labels" . | nindent 4 }}
+spec:
+  {{- /* Leave the replica count to the HorizontalPodAutoscaler when it is enabled. */}}
+  {{- if not .Values.autoscaling.enabled }}
+  replicas: {{ .Values.replicaCount }}
+  {{- end }}
+  selector:
+    matchLabels:
+      {{- include "llama-stack.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      {{- if or .Values.customRunConfig .Values.podAnnotations }}
+      annotations:
+        {{- /* The checksum must live on the pod template: only a pod template change rolls the pods when run.yaml changes. */}}
+        {{- if .Values.customRunConfig }}
+        checksum/run-config: {{ include (print $.Template.BasePath "/config.yaml") . | sha256sum }}
+        {{- end }}
+        {{- with .Values.podAnnotations }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+      labels:
+        {{- include "llama-stack.labels" . | nindent 8 }}
+        {{- with .Values.podLabels }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+    spec:
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      serviceAccountName: {{ include "llama-stack.serviceAccountName" . }}
+      securityContext:
+        {{- toYaml .Values.podSecurityContext | nindent 8 }}
+      containers:
+        - name: {{ .Chart.Name }}
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+          {{- /* image.repository is passed through tpl so it may reference other values. */}}
+          image: "{{ (tpl .Values.image.repository $) }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          ports:
+            - name: http
+              containerPort: {{ .Values.service.port }}
+              protocol: TCP
+          {{- if .Values.customRunConfig }}
+          args:
+            - "--yaml-config"
+            - "/config/run.yaml"
+          {{- end }}
+          env:
+          {{- with .Values.vllm }}
+            - name: VLLM_URL
+              value: {{ .url | quote }}
+            - name: VLLM_API_TOKEN
+              value: {{ .apiKey | default "" | quote }}
+            - name: INFERENCE_MODEL
+              value: {{ .inferenceModel | quote }}
+          {{- end }}
+            - name: LLAMA_STACK_PORT
+              value: {{ .Values.service.port | quote }}
+            {{- if .Values.telemetry.enabled }}
+            - name: TELEMETRY_SINKS
+              value: {{ .Values.telemetry.sinks | quote }}
+            - name: OTEL_SERVICE_NAME
+              value: {{ .Values.telemetry.serviceName | quote }}
+            {{- end }}
+            {{- with .Values.env }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          livenessProbe:
+            {{- tpl (toYaml .Values.livenessProbe) $ | nindent 12 }}
+          readinessProbe:
+            {{- tpl (toYaml .Values.readinessProbe) $ | nindent 12 }}
+          startupProbe:
+            {{- tpl (toYaml .Values.startupProbe) $ | nindent 12 }}
+          resources:
+            {{- toYaml .Values.resources | nindent 12 }}
+          {{- if or .Values.customRunConfig .Values.volumeMounts }}
+          volumeMounts:
+            {{- /* The run-config mount exists only when its ConfigMap is rendered. */}}
+            {{- if .Values.customRunConfig }}
+            - name: config-volume
+              mountPath: /config
+            {{- end }}
+            {{- with .Values.volumeMounts }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          {{- end }}
+      {{- if or .Values.customRunConfig .Values.volumes }}
+      volumes:
+        {{- if .Values.customRunConfig }}
+        - name: config-volume
+          configMap:
+            name: {{ include "llama-stack.fullname" . }}-run-config
+            defaultMode: 0755
+        {{- end }}
+        {{- with .Values.volumes }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
--- a/charts/llama-stack/templates/hpa.yaml
+++ b/charts/llama-stack/templates/hpa.yaml
@ -0,0 +1,32 @@
+{{- /*
+HorizontalPodAutoscaler for the chart's Deployment, rendered only when
+`autoscaling.enabled` is true. A CPU and/or memory utilization metric is
+emitted for each target percentage that is set.
+*/ -}}
+{{- if .Values.autoscaling.enabled }}
+{{- $hpa := .Values.autoscaling }}
+{{- $target := include "llama-stack.fullname" . }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ $target }}
+  labels:
+    {{- include "llama-stack.labels" . | nindent 4 }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ $target }}
+  minReplicas: {{ $hpa.minReplicas }}
+  maxReplicas: {{ $hpa.maxReplicas }}
+  metrics:
+    {{- with $hpa.targetCPUUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: {{ . }}
+    {{- end }}
+    {{- with $hpa.targetMemoryUtilizationPercentage }}
+    - type: Resource
+      resource:
+        name: memory
+        target:
+          type: Utilization
+          averageUtilization: {{ . }}
+    {{- end }}
+{{- end }}
--- a/charts/llama-stack/templates/ingress.yaml
+++ b/charts/llama-stack/templates/ingress.yaml
@ -0,0 +1,43 @@
+{{- /*
+Ingress for the chart's Service, rendered only when `ingress.enabled` is true.
+Every path of every configured host is routed to the Service on `service.port`.
+*/ -}}
+{{- if .Values.ingress.enabled -}}
+{{- $svcName := include "llama-stack.fullname" . -}}
+{{- $svcPort := .Values.service.port -}}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ $svcName }}
+  labels:
+    {{- include "llama-stack.labels" . | nindent 4 }}
+  {{- with .Values.ingress.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{- with .Values.ingress.className }}
+  ingressClassName: {{ . }}
+  {{- end }}
+  {{- with .Values.ingress.tls }}
+  tls:
+    {{- range . }}
+    - hosts:
+        {{- range .hosts }}
+        - {{ . | quote }}
+        {{- end }}
+      secretName: {{ .secretName }}
+    {{- end }}
+  {{- end }}
+  rules:
+    {{- range .Values.ingress.hosts }}
+    - host: {{ .host | quote }}
+      http:
+        paths:
+          {{- range .paths }}
+          - path: {{ .path }}
+            {{- if .pathType }}
+            pathType: {{ .pathType }}
+            {{- end }}
+            backend:
+              service:
+                name: {{ $svcName }}
+                port:
+                  number: {{ $svcPort }}
+          {{- end }}
+    {{- end }}
+{{- end }}
--- a/charts/llama-stack/templates/openshift/route.yaml
+++ b/charts/llama-stack/templates/openshift/route.yaml
@ -0,0 +1,31 @@
+{{- /*
+OpenShift Route to the chart's Service, rendered only when `route.enabled` is
+true. It targets the Service port named "llama-stack". `host` and `path` are
+emitted only when set, and the TLS block only when `route.tls.enabled` is true.
+*/ -}}
+{{- if .Values.route.enabled -}}
+{{- $route := .Values.route -}}
+{{- $svcName := include "llama-stack.fullname" . -}}
+kind: Route
+apiVersion: route.openshift.io/v1
+metadata:
+  name: {{ $svcName }}
+  labels:
+    {{- include "llama-stack.labels" . | nindent 4 }}
+  {{- with $route.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{- with $route.host }}
+  host: {{ . }}
+  {{- end }}
+  {{- with $route.path }}
+  path: {{ . }}
+  {{- end }}
+  to:
+    kind: Service
+    name: {{ $svcName }}
+    weight: 100
+  port:
+    targetPort: llama-stack
+  {{- if $route.tls.enabled }}
+  tls:
+    termination: {{ $route.tls.termination }}
+    insecureEdgeTerminationPolicy: {{ $route.tls.insecureEdgeTerminationPolicy }}
+  {{- end }}
+  wildcardPolicy: None
+{{- end }}
--- a/charts/llama-stack/templates/service.yaml
+++ b/charts/llama-stack/templates/service.yaml
@ -0,0 +1,15 @@
+{{- /*
+Service in front of the llama-stack pods. It exposes `service.port` under the
+port name "llama-stack" and forwards to the container port named "http".
+Pods are matched through the chart's selector labels.
+*/ -}}
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "llama-stack.fullname" . }}
+  labels:
+    {{- include "llama-stack.labels" . | nindent 4 }}
+spec:
+  type: {{ .Values.service.type }}
+  selector:
+    {{- include "llama-stack.selectorLabels" . | nindent 4 }}
+  ports:
+    - name: llama-stack
+      protocol: TCP
+      port: {{ .Values.service.port }}
+      targetPort: http
--- a/charts/llama-stack/templates/serviceaccount.yaml
+++ b/charts/llama-stack/templates/serviceaccount.yaml
@ -0,0 +1,13 @@
+{{- /*
+ServiceAccount for the pod, rendered only when `serviceAccount.create` is true.
+Its name comes from the "llama-stack.serviceAccountName" helper.
+*/ -}}
+{{- if .Values.serviceAccount.create -}}
+{{- $sa := .Values.serviceAccount -}}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "llama-stack.serviceAccountName" . }}
+  labels:
+    {{- include "llama-stack.labels" . | nindent 4 }}
+  {{- with $sa.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+automountServiceAccountToken: {{ $sa.automount }}
+{{- end }}
--- a/charts/llama-stack/values.yaml
+++ b/charts/llama-stack/values.yaml
@ -0,0 +1,152 @@
+
+# When true, the chart's bundled `files/run.yaml` is published as a ConfigMap,
+# mounted at /config, and the server is started with
+# `--yaml-config /config/run.yaml`.
+customRunConfig: false
+
+# Connection settings for the vLLM inference server. The Deployment exports
+# them to the container as VLLM_URL, INFERENCE_MODEL and VLLM_API_TOKEN.
+# TODO: Only vLLM is supported at the moment; this should be expanded in the future.
+vllm:
+  url: "http://vllm-server"
+  inferenceModel: "llama2-7b-chat"
+  # API key for the vLLM server (exported as VLLM_API_TOKEN; empty when unset).
+  # Planned: read the key from a Kubernetes Secret.
+  # TODO: Implement this
+  # secret:
+  #   name: vllm-secret
+  #   key: vll
+  # For now it can only be set inline (should be avoided in production):
+  # apiKey: "xxxxxxxxxxxx"
+
+# Name of the llama-stack distribution to run; see
+# https://llama-stack.readthedocs.io/en/latest/distributions/selection.html
+# It is substituted into the default `image.repository`, so it only has an
+# effect while `image.repository` is not overridden.
+distribution: distribution-remote-vllm
+
+# Telemetry settings. When `enabled` is true the Deployment exports `sinks` as
+# TELEMETRY_SINKS and `serviceName` as OTEL_SERVICE_NAME.
+# NOTE(review): the `serviceName` default looks like a collector endpoint
+# (host:port) rather than a service name — confirm this is intended.
+telemetry:
+  enabled: false
+  serviceName: "otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318"
+  sinks: "console,sqlite,otel"
+
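+# Extra environment variables for the container. The value is appended verbatim
+# to the container's `env` list, so it must be a list of name/value entries.
+# env:
+#   - name: MY_CUSTOM_ENV_VAR
+#     value: "my-custom-env-var-value"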
+
+# Desired number of pod replicas for the Deployment (`spec.replicas`).
+replicaCount: 1
+
+# Container image. See https://kubernetes.io/docs/concepts/containers/images/
+image:
+  # Image pull policy.
+  pullPolicy: Always
+  # Rendered through `tpl`, so it may reference other values.
+  repository: 'docker.io/llamastack/{{ $.Values.distribution }}'
+  # Defaults to the chart's appVersion when left unset.
+  # tag: latest
+
+
+# Service account used by the pod.
+# See https://kubernetes.io/docs/concepts/security/service-accounts/
+serviceAccount:
+  # Whether the chart creates a ServiceAccount.
+  create: false
+  # Name of the service account to use. When empty, the chart's full name is
+  # used if `create` is true, and "default" otherwise.
+  name: ""
+  # Annotations added to the created ServiceAccount.
+  annotations: {}
+  # Value of `automountServiceAccountToken` on the created ServiceAccount.
+  automount: true
+
+# Extra annotations added to the pod.
+# See https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/
+podAnnotations: {}
+# Extra labels added to the pod, in addition to the chart's common labels.
+# See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
+podLabels: {}
+
+# Pod-level security context, rendered verbatim into the pod spec.
+podSecurityContext: {}
+  # fsGroup: 2000
+
+# Container-level security context, rendered verbatim into the container spec.
+securityContext: {}
+  # runAsNonRoot: true
+  # runAsUser: 1000
+
+# Service exposing the llama-stack server.
+# See https://kubernetes.io/docs/concepts/services-networking/service/
+service:
+  # Port of the Service. The same number is used as the container port, as the
+  # LLAMA_STACK_PORT environment variable, and as the Ingress backend port.
+  # See https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports
+  port: 5001
+  # Service type.
+  # See https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
+  type: ClusterIP
+
+# Additional volumes appended verbatim to the pod's `volumes` list.
+volumes: []
+# - name: foo
+#   secret:
+#     secretName: mysecret
+#     optional: false
+
+# Additional volumeMounts appended verbatim to the container's `volumeMounts`
+# list. Each `name` should match a volume defined above.
+volumeMounts: []
+# - name: foo
+#   mountPath: "/etc/foo"
+#   readOnly: true
+
+
+# Ingress in front of the Service.
+# See https://kubernetes.io/docs/concepts/services-networking/ingress/
+ingress:
+  enabled: true
+  # Rendered as `ingressClassName`; omitted when empty.
+  className: ''
+  annotations: {}
+    # kubernetes.io/ingress.class: nginx
+    # kubernetes.io/tls-acme: "true"
+  # Hosts and paths to route; every path is sent to the Service on `service.port`.
+  hosts:
+    - host: chart-example.local
+      paths:
+        - pathType: ImplementationSpecific
+          path: /
+  # TLS entries: a secret name plus the hosts it covers.
+  tls: []
+  #  - secretName: chart-example-tls
+  #    hosts:
+  #      - chart-example.local
+
+
+# -- OpenShift Route settings (use this instead of `ingress` on OpenShift)
+route:
+  # -- Create the OpenShift Route object
+  enabled: false
+  # -- Additional custom annotations for the route
+  annotations: {}
+  # Left blank, the field is omitted and OpenShift determines the host.
+  # -- The hostname for the route
+  # @default -- Set by OpenShift
+  host: ""
+  # -- The path for the OpenShift route (omitted when blank)
+  path: ""
+  tls:
+    # -- Enable secure route settings
+    enabled: true
+    # -- Insecure route termination policy
+    insecureEdgeTerminationPolicy: Redirect
+    # -- Secure route termination policy
+    termination: edge
+
+
+# CPU and memory requests/limits for the container, rendered verbatim into the
+# container's `resources` field.
+resources:
+  requests:
+    cpu: 100m
+    memory: 500Mi
+  limits:
+    cpu: 100m
+    memory: 500Mi
+
+# Liveness, readiness and startup probes for the container. Each block is
+# rendered (through `tpl`) verbatim into the Deployment, so any Kubernetes
+# probe field may be set here.
+# See https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
+#
+# `port: http` refers to the container's named port, whose number is taken from
+# `service.port`. The probes therefore keep working when `service.port` is
+# overridden, without having to repeat the number here.
+livenessProbe:
+  httpGet:
+    path: /v1/health
+    port: http
+readinessProbe:
+  httpGet:
+    path: /v1/health
+    port: http
+startupProbe:
+  httpGet:
+    path: /v1/health
+    port: http
+  initialDelaySeconds: 40
+  periodSeconds: 10
+  failureThreshold: 30
+
+# Horizontal pod autoscaling for the Deployment.
+# See https://kubernetes.io/docs/concepts/workloads/autoscaling/
+autoscaling:
+  # Render a HorizontalPodAutoscaler targeting the Deployment.
+  enabled: false
+  maxReplicas: 100
+  minReplicas: 1
+  # A utilization metric is added for each target that is set.
+  targetCPUUtilizationPercentage: 80
+  # targetMemoryUtilizationPercentage: 80