Adding helm chart for deploying llama-stack

Jamie Land 2025-03-18 11:32:40 -04:00
parent bfc79217a8
commit 1bc1f08037
14 changed files with 692 additions and 0 deletions

1
.gitignore vendored

@@ -23,3 +23,4 @@ venv/
pytest-report.xml
.coverage
.python-version
**/local-*

23
chart/.helmignore Normal file

@@ -0,0 +1,23 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/

24
chart/Chart.yaml Normal file

@@ -0,0 +1,24 @@
apiVersion: v2
name: llama-stack
description: Basic chart for deploying llama-stack
# A chart can be either an 'application' or a 'library' chart.
#
# Application charts are a collection of templates that can be packaged into versioned archives
# to be deployed.
#
# Library charts provide useful utilities or functions for the chart developer. They're included as
# a dependency of application charts to inject those utilities and functions into the rendering
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes.
appVersion: "1.16.0"

85
chart/README.md Normal file

@@ -0,0 +1,85 @@
# Llama Stack Helm Chart
This Helm chart installs Llama Stack, a platform that standardizes the core building blocks of generative AI applications.
The chart provides a convenient way to deploy and manage Llama Stack on Kubernetes or OpenShift clusters, and offers flexibility in customizing the deployment through values such as image repositories, probe configurations, resource limits, and more.
Optionally, the chart also supports the installation of the llama-stack-playground, which provides a web-based interface for interacting with the Llama Stack.
## Quick Start
Create a `local-values.yaml` file with the following:
> **Note**
> The chart currently supports only the `vllm` provider directly, but other distributions can be used by setting the `env` values directly (see the sketch after the values table below).
```yaml
vllm:
url: "https://<MY_VLLM_INSTANCE>:443/v1"
inferenceModel: "meta-llama/Llama-3.1-8B-Instruct"
apiKey: xxxxxxxxxxxxxxxxxxxxxxxxxxxxx
```
Log in to your Kubernetes cluster through the CLI and run:
```sh
helm upgrade -i llama-stack . -f local-values.yaml
```
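
Once the release is installed, a quick sanity check (assuming the `llama-stack` release name above; the fullname helper collapses the name because the release already contains the chart name):

```sh
# Wait for the Deployment created by the chart to finish rolling out
kubectl rollout status deployment/llama-stack

# List the pods via the chart's selector labels
kubectl get pods -l app.kubernetes.io/instance=llama-stack
```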
## Values
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| autoscaling.enabled | bool | `false` | |
| autoscaling.maxReplicas | int | `100` | |
| autoscaling.minReplicas | int | `1` | |
| autoscaling.targetCPUUtilizationPercentage | int | `80` | |
| distribution | string | `"distribution-remote-vllm"` | |
| image.pullPolicy | string | `"Always"` | |
| image.repository | string | `"docker.io/llamastack/{{ $.Values.distribution }}"` | |
| image.tag | string | `"0.1.6"` | |
| ingress.annotations | object | `{}` | |
| ingress.className | string | `""` | |
| ingress.enabled | bool | `true` | |
| ingress.hosts[0].host | string | `"chart-example.local"` | |
| ingress.hosts[0].paths[0].path | string | `"/"` | |
| ingress.hosts[0].paths[0].pathType | string | `"ImplementationSpecific"` | |
| ingress.tls | list | `[]` | |
| livenessProbe.httpGet.path | string | `"/v1/health"` | |
| livenessProbe.httpGet.port | int | `5001` | |
| podAnnotations | object | `{}` | |
| podLabels | object | `{}` | |
| podSecurityContext | object | `{}` | |
| readinessProbe.httpGet.path | string | `"/v1/health"` | |
| readinessProbe.httpGet.port | int | `5001` | |
| replicaCount | int | `1` | |
| resources.limits.cpu | string | `"100m"` | |
| resources.limits.memory | string | `"500Mi"` | |
| resources.requests.cpu | string | `"100m"` | |
| resources.requests.memory | string | `"500Mi"` | |
| route | object | `{"annotations":{},"enabled":false,"host":"","path":"","tls":{"enabled":true,"insecureEdgeTerminationPolicy":"Redirect","termination":"edge"}}` | Enable creation of the OpenShift Route object (This should be used instead of ingress on OpenShift) |
| route.annotations | object | `{}` | Additional custom annotations for the route |
| route.host | string | Set by OpenShift | The hostname for the route |
| route.path | string | `""` | The path for the OpenShift route |
| route.tls.enabled | bool | `true` | Enable secure route settings |
| route.tls.insecureEdgeTerminationPolicy | string | `"Redirect"` | Insecure route termination policy |
| route.tls.termination | string | `"edge"` | Secure route termination policy |
| runConfig.enabled | bool | `false` | |
| service.port | int | `5001` | |
| service.type | string | `"ClusterIP"` | |
| serviceAccount.annotations | object | `{}` | |
| serviceAccount.automount | bool | `true` | |
| serviceAccount.create | bool | `false` | |
| serviceAccount.name | string | `""` | |
| startupProbe.failureThreshold | int | `30` | |
| startupProbe.httpGet.path | string | `"/v1/health"` | |
| startupProbe.httpGet.port | int | `5001` | |
| startupProbe.initialDelaySeconds | int | `40` | |
| startupProbe.periodSeconds | int | `10` | |
| telemetry.enabled | bool | `false` | |
| telemetry.serviceName | string | `"otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318"` | |
| telemetry.sinks | string | `"console,sqlite,otel"` | |
| vllm.inferenceModel | string | `"llama2-7b-chat"` | |
| vllm.url | string | `"http://vllm-server"` | |
| yamlConfig | string | `"/config/run.yaml"` | |
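
As a sketch of the `env` escape hatch mentioned in the note above, a `local-values.yaml` for a different distribution might look like the following (the distribution name and environment variable are illustrative, not tested):

```yaml
# Hypothetical override: swap the image distribution and pass extra
# environment variables straight through to the container spec.
distribution: "distribution-ollama"     # pick any distribution from the llama-stack docs
env:
  - name: OLLAMA_URL                    # illustrative variable for the chosen distribution
    value: "http://ollama-server:11434"
```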

123
chart/files/run.yaml Executable file

@@ -0,0 +1,123 @@
version: '2'
image_name: vllm-gpu
apis:
- agents
- datasetio
- eval
- inference
- safety
- scoring
- telemetry
- tool_runtime
- vector_io
providers:
  inference:
  - provider_id: vllm
    provider_type: remote::vllm
    config:
      url: ${env.VLLM_URL:http://localhost:8000}
      api_token: ${env.VLLM_API_TOKEN}
      model: ${env.INFERENCE_MODEL:Llama3.2-3B-Instruct}
      tensor_parallel_size: ${env.TENSOR_PARALLEL_SIZE:1}
      max_tokens: ${env.MAX_TOKENS:4096}
      enforce_eager: ${env.ENFORCE_EAGER:False}
      gpu_memory_utilization: ${env.GPU_MEMORY_UTILIZATION:0.7}
  - provider_id: sentence-transformers
    provider_type: inline::sentence-transformers
    config: {}
  vector_io:
  - provider_id: faiss
    provider_type: inline::faiss
    config:
      kvstore:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/faiss_store.db
  safety:
  - provider_id: llama-guard
    provider_type: inline::llama-guard
    config: {}
  agents:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      persistence_store:
        type: sqlite
        namespace: null
        db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/agents_store.db
  telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: ${env.OTEL_SERVICE_NAME:llama-stack}
      sinks: ${env.TELEMETRY_SINKS:console,sqlite}
      sqlite_db_path: ${env.SQLITE_DB_PATH:~/.llama/distributions/vllm-gpu/trace_store.db}
  eval:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config: {}
  datasetio:
  - provider_id: huggingface
    provider_type: remote::huggingface
    config: {}
  - provider_id: localfs
    provider_type: inline::localfs
    config: {}
  scoring:
  - provider_id: basic
    provider_type: inline::basic
    config: {}
  - provider_id: llm-as-judge
    provider_type: inline::llm-as-judge
    config: {}
  - provider_id: braintrust
    provider_type: inline::braintrust
    config:
      openai_api_key: ${env.OPENAI_API_KEY:}
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search
    config:
      api_key: ${env.BRAVE_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: tavily-search
    provider_type: remote::tavily-search
    config:
      api_key: ${env.TAVILY_SEARCH_API_KEY:}
      max_results: 3
  - provider_id: code-interpreter
    provider_type: inline::code-interpreter
    config: {}
  - provider_id: rag-runtime
    provider_type: inline::rag-runtime
    config: {}
  - provider_id: model-context-protocol
    provider_type: remote::model-context-protocol
    config: {}
metadata_store:
  type: sqlite
  db_path: ${env.SQLITE_STORE_DIR:~/.llama/distributions/vllm-gpu}/registry.db
models:
- metadata: {}
  model_id: ${env.INFERENCE_MODEL}
  provider_id: vllm
  model_type: llm
- metadata:
    embedding_dimension: 384
  model_id: all-MiniLM-L6-v2
  provider_id: sentence-transformers
  model_type: embedding
shields: []
vector_dbs: []
datasets: []
scoring_fns: []
benchmarks: []
tool_groups:
- toolgroup_id: builtin::websearch
  provider_id: brave-search
- toolgroup_id: builtin::rag
  provider_id: rag-runtime
- toolgroup_id: builtin::code_interpreter
  provider_id: code-interpreter
server:
  port: 8321
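
llama-stack resolves `${env.VAR:default}` references in this file at startup, so the same run.yaml works locally and in the cluster; the deployment template below injects `VLLM_URL`, `VLLM_API_TOKEN`, and `INFERENCE_MODEL` from chart values. A local smoke test of the same configuration might look like this (the endpoint and model are assumptions):

```sh
# Export the variables the config consumes, then run the distribution
# image the chart deploys (image and tag are the chart defaults).
export VLLM_URL="http://localhost:8000/v1"
export VLLM_API_TOKEN="dummy"
export INFERENCE_MODEL="meta-llama/Llama-3.1-8B-Instruct"

docker run --rm -p 8321:8321 \
  -e VLLM_URL -e VLLM_API_TOKEN -e INFERENCE_MODEL \
  docker.io/llamastack/distribution-remote-vllm:0.1.6
```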

62
chart/templates/_helpers.tpl Normal file

@@ -0,0 +1,62 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "llama-stack.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "llama-stack.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "llama-stack.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "llama-stack.labels" -}}
helm.sh/chart: {{ include "llama-stack.chart" . }}
{{ include "llama-stack.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "llama-stack.selectorLabels" -}}
app.kubernetes.io/name: {{ include "llama-stack.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "llama-stack.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "llama-stack.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
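
With these helpers, a release named `demo` renders resources named `demo-llama-stack` (or just the release name when it already contains the chart name). Since every value has a default, the generated names can be inspected without a cluster:

```sh
# Render the chart locally and check the resource names the helpers produce
helm template demo . | grep "name: demo"
```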

7
chart/templates/configmap.yaml Normal file

@@ -0,0 +1,7 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "llama-stack.fullname" . }}-run-config
data:
  run.yaml: |-
    {{- .Files.Get "files/run.yaml" | nindent 4 }}

91
chart/templates/deployment.yaml Normal file

@@ -0,0 +1,91 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "llama-stack.fullname" . }}
  labels:
    {{- include "llama-stack.labels" . | nindent 4 }}
spec:
  replicas: {{ .Values.replicaCount }}
  selector:
    matchLabels:
      {{- include "llama-stack.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      {{- with .Values.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        {{- include "llama-stack.labels" . | nindent 8 }}
        {{- with .Values.podLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    spec:
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "llama-stack.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      containers:
        - name: {{ .Chart.Name }}
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: "{{ (tpl .Values.image.repository $) }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          ports:
            - name: http
              containerPort: {{ .Values.service.port }}
              protocol: TCP
          args:
            - "--yaml-config"
            - "/config/run.yaml"
          env:
            {{- with .Values.vllm }}
            - name: VLLM_URL
              value: {{ .url | quote }}
            - name: VLLM_API_TOKEN
              value: {{ .apiKey | default "" | quote }}
            - name: INFERENCE_MODEL
              value: {{ .inferenceModel | quote }}
            {{- end }}
            - name: LLAMA_STACK_PORT
              value: {{ .Values.service.port | quote }}
            {{- if .Values.telemetry.enabled }}
            - name: TELEMETRY_SINKS
              value: {{ .Values.telemetry.sinks | quote }}
            - name: OTEL_SERVICE_NAME
              value: {{ .Values.telemetry.serviceName | quote }}
            {{- end }}
            {{- with .Values.env }}
            {{- toYaml . | nindent 12 }}
            {{- end }}
          livenessProbe:
            {{- tpl (toYaml .Values.livenessProbe) $ | nindent 12 }}
          readinessProbe:
            {{- tpl (toYaml .Values.readinessProbe) $ | nindent 12 }}
          startupProbe:
            {{- tpl (toYaml .Values.startupProbe) $ | nindent 12 }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
          volumeMounts:
            - name: config-volume
              mountPath: /config
      volumes:
        - name: config-volume
          configMap:
            name: {{ include "llama-stack.fullname" . }}-run-config
            defaultMode: 0755
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
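
One detail worth calling out: `image.repository` is passed through `tpl`, so the `{{ $.Values.distribution }}` placeholder in values.yaml is expanded at render time. The resulting image reference can be verified offline:

```sh
# With default values this should print the remote-vllm distribution image
helm template demo . | grep "image:"
# expected: image: "docker.io/llamastack/distribution-remote-vllm:0.1.6"
```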

32
chart/templates/hpa.yaml Normal file

@@ -0,0 +1,32 @@
{{- if .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "llama-stack.fullname" . }}
  labels:
    {{- include "llama-stack.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "llama-stack.fullname" . }}
  minReplicas: {{ .Values.autoscaling.minReplicas }}
  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
  metrics:
    {{- if .Values.autoscaling.targetCPUUtilizationPercentage }}
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
    {{- end }}
    {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
    - type: Resource
      resource:
        name: memory
        target:
          type: Utilization
          averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
    {{- end }}
{{- end }}
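
The HPA is only rendered when autoscaling is enabled. A sketch of values that turns it on (the numbers are illustrative):

```yaml
autoscaling:
  enabled: true
  minReplicas: 2
  maxReplicas: 5
  targetCPUUtilizationPercentage: 80
  # The template above also supports scaling on memory:
  # targetMemoryUtilizationPercentage: 80
```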

43
chart/templates/ingress.yaml Normal file

@@ -0,0 +1,43 @@
{{- if .Values.ingress.enabled -}}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: {{ include "llama-stack.fullname" . }}
  labels:
    {{- include "llama-stack.labels" . | nindent 4 }}
  {{- with .Values.ingress.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  {{- with .Values.ingress.className }}
  ingressClassName: {{ . }}
  {{- end }}
  {{- if .Values.ingress.tls }}
  tls:
    {{- range .Values.ingress.tls }}
    - hosts:
        {{- range .hosts }}
        - {{ . | quote }}
        {{- end }}
      secretName: {{ .secretName }}
    {{- end }}
  {{- end }}
  rules:
    {{- range .Values.ingress.hosts }}
    - host: {{ .host | quote }}
      http:
        paths:
          {{- range .paths }}
          - path: {{ .path }}
            {{- with .pathType }}
            pathType: {{ . }}
            {{- end }}
            backend:
              service:
                name: {{ include "llama-stack.fullname" $ }}
                port:
                  number: {{ $.Values.service.port }}
          {{- end }}
    {{- end }}
{{- end }}
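
A hedged example of values that wire a host and TLS secret through this template (the hostname, class, and secret name are placeholders):

```yaml
ingress:
  enabled: true
  className: "nginx"                 # assumes an nginx ingress controller is installed
  hosts:
    - host: llama.example.com
      paths:
        - path: /
          pathType: Prefix
  tls:
    - secretName: llama-example-tls  # pre-created TLS secret in the release namespace
      hosts:
        - llama.example.com
```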

31
chart/templates/route.yaml Normal file

@@ -0,0 +1,31 @@
{{- if .Values.route.enabled -}}
kind: Route
apiVersion: route.openshift.io/v1
metadata:
  name: {{ include "llama-stack.fullname" . }}
  labels:
    {{- include "llama-stack.labels" . | nindent 4 }}
  {{- with .Values.route.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  {{- if .Values.route.host }}
  host: {{ .Values.route.host }}
  {{- end }}
  {{- if .Values.route.path }}
  path: {{ .Values.route.path }}
  {{- end }}
  to:
    kind: Service
    name: {{ include "llama-stack.fullname" . }}
    weight: 100
  port:
    targetPort: llama-stack
  {{- if .Values.route.tls.enabled }}
  tls:
    termination: {{ .Values.route.tls.termination }}
    insecureEdgeTerminationPolicy: {{ .Values.route.tls.insecureEdgeTerminationPolicy }}
  {{- end }}
  wildcardPolicy: None
{{- end }}
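
On OpenShift the Route replaces the Ingress, so both should not be enabled at once. A minimal sketch:

```yaml
ingress:
  enabled: false   # the chart enables ingress by default; turn it off in favor of the Route
route:
  enabled: true
  # host: ""       # leave empty to let OpenShift assign a hostname
  tls:
    enabled: true
    termination: edge
    insecureEdgeTerminationPolicy: Redirect
```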

15
chart/templates/service.yaml Normal file

@@ -0,0 +1,15 @@
apiVersion: v1
kind: Service
metadata:
  name: {{ include "llama-stack.fullname" . }}
  labels:
    {{- include "llama-stack.labels" . | nindent 4 }}
spec:
  type: {{ .Values.service.type }}
  ports:
    - port: {{ .Values.service.port }}
      targetPort: http
      protocol: TCP
      name: llama-stack
  selector:
    {{- include "llama-stack.selectorLabels" . | nindent 4 }}

13
chart/templates/serviceaccount.yaml Normal file

@@ -0,0 +1,13 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "llama-stack.serviceAccountName" . }}
  labels:
    {{- include "llama-stack.labels" . | nindent 4 }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
automountServiceAccountToken: {{ .Values.serviceAccount.automount }}
{{- end }}

142
chart/values.yaml Normal file

@@ -0,0 +1,142 @@
# yamlConfig: "/config/run.yaml"

# TODO: Currently this only works for vLLM; it should be expanded in the future.
vllm:
  url: "http://vllm-server"
  inferenceModel: "llama2-7b-chat"
  # This is the API key for the vLLM server. It can be set in two ways: through a secret:
  # TODO: Implement this
  # secret:
  #   name: vllm-secret
  #   key: vll
  # or directly with an API key (should be avoided in production):
  # apiKey: "xxxxxxxxxxxx"

# https://llama-stack.readthedocs.io/en/latest/distributions/selection.html
distribution: distribution-remote-vllm

runConfig:
  enabled: false
  # customYaml:
  #   Your custom run.yaml configuration can be pasted here.
  #   If not set, the default run.yaml in `files/run.yaml` will be used.

telemetry:
  enabled: false
  serviceName: "otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318"
  sinks: "console,sqlite,otel"

# Use this to pass additional environment variables to the container, e.g.:
# env:
#   - name: MY_CUSTOM_ENV_VAR
#     value: "my-custom-env-var-value"

replicaCount: 1

# This sets the container image; more information can be found here: https://kubernetes.io/docs/concepts/containers/images/
image:
  repository: docker.io/llamastack/{{ $.Values.distribution }}
  tag: 0.1.6
  # This sets the pull policy for images.
  pullPolicy: Always

# This section builds out the service account; more information can be found here: https://kubernetes.io/docs/concepts/security/service-accounts/
serviceAccount:
  # Specifies whether a service account should be created
  create: false
  # Automatically mount a ServiceAccount's API credentials?
  automount: true
  # Annotations to add to the service account
  annotations: {}
  # The name of the service account to use.
  # If not set and create is true, a name is generated using the fullname template
  name: ""

# This is for setting Kubernetes annotations on a Pod.
# For more information check out: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/
podAnnotations: {}
# This is for setting Kubernetes labels on a Pod.
# For more information check out: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
podLabels: {}

podSecurityContext: {}
  # fsGroup: 2000

# This is for setting up a service; more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/
service:
  # This sets the service type; more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#publishing-services-service-types
  type: ClusterIP
  # This sets the ports; more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports
  port: 5001

# This block is for setting up the ingress; more information can be found here: https://kubernetes.io/docs/concepts/services-networking/ingress/
ingress:
  enabled: true
  className: ""
  annotations: {}
    # kubernetes.io/ingress.class: nginx
    # kubernetes.io/tls-acme: "true"
  hosts:
    - host: chart-example.local
      paths:
        - path: /
          pathType: ImplementationSpecific
  tls: []
  #  - secretName: chart-example-tls
  #    hosts:
  #      - chart-example.local

# -- Enable creation of the OpenShift Route object (this should be used instead of ingress on OpenShift)
route:
  enabled: false
  # Allow OCP to determine the host if left blank
  # -- The hostname for the route
  # @default -- Set by OpenShift
  host: ""
  # -- The path for the OpenShift route
  path: ""
  tls:
    # -- Enable secure route settings
    enabled: true
    # -- Secure route termination policy
    termination: edge
    # -- Insecure route termination policy
    insecureEdgeTerminationPolicy: Redirect
  # -- Additional custom annotations for the route
  annotations: {}

resources:
  limits:
    cpu: 100m
    memory: 500Mi
  requests:
    cpu: 100m
    memory: 500Mi

# This sets up the liveness and readiness probes; more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
livenessProbe:
  httpGet:
    path: /v1/health
    port: 5001
readinessProbe:
  httpGet:
    path: /v1/health
    port: 5001
startupProbe:
  httpGet:
    path: /v1/health
    port: 5001
  initialDelaySeconds: 40
  periodSeconds: 10
  failureThreshold: 30

# This section is for setting up autoscaling; more information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling:
  enabled: false
  minReplicas: 1
  maxReplicas: 100
  targetCPUUtilizationPercentage: 80
  # targetMemoryUtilizationPercentage: 80
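
As one last worked override: flipping the telemetry block on injects `TELEMETRY_SINKS` and `OTEL_SERVICE_NAME` into the container (see the deployment template above), pointing traces at the collector endpoint below (the default endpoint assumes the OpenShift OpenTelemetry operator namespace):

```yaml
telemetry:
  enabled: true
  sinks: "console,sqlite,otel"
  serviceName: "otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318"
```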