From 8a61e028da55479572061942231f484c191fc0b2 Mon Sep 17 00:00:00 2001 From: Jamie Land Date: Tue, 18 Mar 2025 12:13:02 -0400 Subject: [PATCH] Readme Updates --- chart/README.md | 127 ++++++++++++++++++-------------- chart/templates/config.yaml | 2 + chart/templates/deployment.yaml | 12 +++ chart/values.yaml | 17 ++++- 4 files changed, 102 insertions(+), 56 deletions(-) diff --git a/chart/README.md b/chart/README.md index eb319924e..cbd704078 100644 --- a/chart/README.md +++ b/chart/README.md @@ -12,9 +12,12 @@ Optionally, the chart also supports the installation of the llama-stack-playgrou Create a `local-values.yaml` file with the following: > **Note** -> Chart currently only supports `vllm` framework directly. But other distributions can be used by modifying the `env` directly. +> Chart currently only supports `vllm` framework directly. But other distributions can be managed by adding to the `env` inside the values file directly. ```yaml + +distribution: distribution-remote-vllm + vllm: url: "https://:443/v1" inferenceModel: "meta-llama/Llama-3.1-8B-Instruct" @@ -27,59 +30,73 @@ Login to Kubernetes through the CLI and run: helm upgrade -i ollama-stack . -f local-values.yaml ``` +## Custom Configuration + +By default, llama-stack will use the run.yaml config that comes with the specified distribution. For more granular control, `customRunConfig` can be set to `true`, in which case the Helm chart will use the configuration in `files/run.yaml` instead. 
+ ## Values -| Key | Type | Default | Description | -|-----|------|---------|-------------| -| autoscaling.enabled | bool | `false` | | -| autoscaling.maxReplicas | int | `100` | | -| autoscaling.minReplicas | int | `1` | | -| autoscaling.targetCPUUtilizationPercentage | int | `80` | | -| distribution | string | `"distribution-remote-vllm"` | | -| image.pullPolicy | string | `"Always"` | | -| image.repository | string | `"docker.io/llamastack/{{ $.Values.distribution }}"` | | -| image.tag | string | `"0.1.6"` | | -| ingress.annotations | object | `{}` | | -| ingress.className | string | `""` | | -| ingress.enabled | bool | `true` | | -| ingress.hosts[0].host | string | `"chart-example.local"` | | -| ingress.hosts[0].paths[0].path | string | `"/"` | | -| ingress.hosts[0].paths[0].pathType | string | `"ImplementationSpecific"` | | -| ingress.tls | list | `[]` | | -| livenessProbe.httpGet.path | string | `"/v1/health"` | | -| livenessProbe.httpGet.port | int | `5001` | | -| podAnnotations | object | `{}` | | -| podLabels | object | `{}` | | -| podSecurityContext | object | `{}` | | -| readinessProbe.httpGet.path | string | `"/v1/health"` | | -| readinessProbe.httpGet.port | int | `5001` | | -| replicaCount | int | `1` | | -| resources.limits.cpu | string | `"100m"` | | -| resources.limits.memory | string | `"500Mi"` | | -| resources.requests.cpu | string | `"100m"` | | -| resources.requests.memory | string | `"500Mi"` | | -| route | object | `{"annotations":{},"enabled":false,"host":"","path":"","tls":{"enabled":true,"insecureEdgeTerminationPolicy":"Redirect","termination":"edge"}}` | Enable creation of the OpenShift Route object (This should be used instead of ingress on OpenShift) | -| route.annotations | object | `{}` | Additional custom annotations for the route | -| route.host | string | Set by OpenShift | The hostname for the route | -| route.path | string | `""` | The path for the OpenShift route | -| route.tls.enabled | bool | `true` | Enable secure route 
settings | -| route.tls.insecureEdgeTerminationPolicy | string | `"Redirect"` | Insecure route termination policy | -| route.tls.termination | string | `"edge"` | Secure route termination policy | -| runConfig.enabled | bool | `false` | | -| service.port | int | `5001` | | -| service.type | string | `"ClusterIP"` | | -| serviceAccount.annotations | object | `{}` | | -| serviceAccount.automount | bool | `true` | | -| serviceAccount.create | bool | `false` | | -| serviceAccount.name | string | `""` | | -| startupProbe.failureThreshold | int | `30` | | -| startupProbe.httpGet.path | string | `"/v1/health"` | | -| startupProbe.httpGet.port | int | `5001` | | -| startupProbe.initialDelaySeconds | int | `40` | | -| startupProbe.periodSeconds | int | `10` | | -| telemetry.enabled | bool | `false` | | -| telemetry.serviceName | string | `"otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318"` | | -| telemetry.sinks | string | `"console,sqlite,otel"` | | -| vllm.inferenceModel | string | `"llama2-7b-chat"` | | -| vllm.url | string | `"http://vllm-server"` | | -| yamlConfig | string | `"/config/run.yaml"` | | +### Llama Stack Specific + +| Key | Type | Default | Description | +| :---------------------- | :------- | :------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------ | +| `customRunConfig` | `bool` | `false` | Indicates whether a custom run configuration is being used. | +| `distribution` | `string` | `"distribution-remote-vllm"` | Specifies the distribution or type of deployment being used (in this case, related to a remote vLLM distribution). | +| `telemetry.enabled` | `bool` | `false` | Enables or disables telemetry collection. 
| +| `telemetry.serviceName` | `string` | `"otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318"` | The service name and address of the telemetry collector. | +| `telemetry.sinks` | `string` | `"console,sqlite,otel"` | Specifies the destinations or sinks where telemetry data will be sent. | +| `vllm.inferenceModel` | `string` | `"llama2-7b-chat"` | The specific inference model to be used by vLLM (a high-throughput and memory-efficient inference service for large language models). | +| `vllm.url` | `string` | `"http://vllm-server"` | The URL of the vLLM service. | +| `env` | `object` | N/A | A set of key/value pairs that can be set in the pod | + +### General + +| Key | Type | Default | Description | +| :----------------------------------------- | :----- | :----------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `autoscaling.enabled` | `bool` | `false` | Enables or disables horizontal pod autoscaling, which automatically adjusts the number of running instances based on CPU utilization. | +| `autoscaling.maxReplicas` | `int` | `100` | The maximum number of pod replicas that the autoscaler can scale up to. | +| `autoscaling.minReplicas` | `int` | `1` | The minimum number of pod replicas that will always be running. | +| `autoscaling.targetCPUUtilizationPercentage` | `int` | `80` | The target average CPU utilization across all running pods that the autoscaler will aim to maintain. | +| `image.pullPolicy` | `string` | `"Always"` | Defines when to pull the Docker image for the container (e.g., always pull, pull if not present, etc.). 
| +| `image.repository` | `string` | `"docker.io/llamastack/{{ $.Values.distribution }}"` | The Docker image repository where the container image is located. It likely uses the `distribution` value to construct the full image path. | +| `image.tag` | `string` | `"0.1.6"` | The specific version tag of the Docker image to use. | +| `ingress.annotations` | `object` | `{}` | Kubernetes Ingress annotations, which can be used to configure load balancers and other external access settings. | +| `ingress.className` | `string` | `""` | The name of the Ingress controller to use for this Ingress resource. | +| `ingress.enabled` | `bool` | `true` | Enables or disables the creation of a Kubernetes Ingress resource, which allows external access to the application. | +| `ingress.hosts[0].host` | `string` | `"chart-example.local"` | The hostname that the Ingress will route traffic to. This is often a placeholder or example. | +| `ingress.hosts[0].paths[0].path` | `string` | `"/"` | The path on the specified host that the Ingress will route traffic to (in this case, the root path). | +| `ingress.hosts[0].paths[0].pathType` | `string` | `"ImplementationSpecific"` | The type of path matching used by the Ingress controller. | +| `ingress.tls` | `list` | `[]` | Configuration for Transport Layer Security (TLS) termination at the Ingress, allowing for HTTPS. | +| `livenessProbe.httpGet.path` | `string` | `"/v1/health"` | The HTTP endpoint path that the liveness probe will check to determine if the container is running and healthy. | +| `livenessProbe.httpGet.port` | `int` | `5001` | The port that the liveness probe will connect to for the HTTP health check. | +| `podAnnotations` | `object` | `{}` | Kubernetes Pod annotations, which can be used to attach arbitrary non-identifying metadata to the Pod. | +| `podLabels` | `object` | `{}` | Kubernetes Pod labels, which are key/value pairs that are attached to Pods and can be used for organizing and selecting groups of Pods. 
| +| `podSecurityContext` | `object` | `{}` | Defines the security context for the Pod, such as user and group IDs, security capabilities, etc. | +| `readinessProbe.httpGet.path` | `string` | `"/v1/health"` | The HTTP endpoint path that the readiness probe will check to determine if the container is ready to serve traffic. | +| `readinessProbe.httpGet.port` | `int` | `5001` | The port that the readiness probe will connect to for the HTTP readiness check. | +| `replicaCount` | `int` | `1` | The desired number of pod replicas to run. | +| `resources.limits.cpu` | `string` | `"100m"` | The maximum amount of CPU resources that a container can use (in millicores). | +| `resources.limits.memory` | `string` | `"500Mi"` | The maximum amount of memory that a container can use (in mebibytes, `Mi`). | +| `resources.requests.cpu` | `string` | `"100m"` | The amount of CPU resources that Kubernetes will guarantee to be available for the container. | +| `resources.requests.memory` | `string` | `"500Mi"` | The amount of memory that Kubernetes will guarantee to be available for the container (in mebibytes, `Mi`). | +| `route` | `object` | `{"annotations":{},"enabled":false,"host":"","path":"","tls":{"enabled":true,"insecureEdgeTerminationPolicy":"Redirect","termination":"edge"}}` | Configuration for an OpenShift Route object, which is used for exposing services externally on OpenShift. | +| `route.annotations` | `object` | `{}` | Additional custom annotations for the OpenShift Route object. | +| `route.host` | `string` | `Set by OpenShift` | The hostname for the OpenShift Route. This is typically managed by OpenShift. | +| `route.path` | `string` | `""` | The path for the OpenShift Route. | +| `route.tls.enabled` | `bool` | `true` | Enables or disables TLS for the OpenShift Route, providing secure communication. | +| `route.tls.insecureEdgeTerminationPolicy` | `string` | `"Redirect"` | The policy for handling insecure (HTTP) requests when TLS termination is at the edge (Route). 
| +| `route.tls.termination` | `string` | `"edge"` | Specifies that TLS termination occurs at the OpenShift Route edge. | +| `runConfig.enabled` | `bool` | `false` | Indicates whether a specific run configuration is enabled. | +| `service.port` | `int` | `5001` | The port on which the Kubernetes Service will be exposed internally within the cluster. | +| `service.type` | `string` | `"ClusterIP"` | The type of Kubernetes Service. `ClusterIP` makes the service only reachable from within the cluster. | +| `serviceAccount.annotations` | `object` | `{}` | Annotations for the Kubernetes ServiceAccount. | +| `serviceAccount.automount` | `bool` | `true` | Indicates whether the ServiceAccount token should be automatically mounted into the Pods. | +| `serviceAccount.create` | `bool` | `false` | Determines whether a new Kubernetes ServiceAccount should be created. | +| `serviceAccount.name` | `string` | `""` | The name of an existing Kubernetes ServiceAccount to use. If `create` is true and this is empty, a default name will be generated. | +| `startupProbe.failureThreshold` | `int` | `30` | The number of consecutive failures of the startup probe before Kubernetes considers the container failed to start. | +| `startupProbe.httpGet.path` | `string` | `"/v1/health"` | The HTTP endpoint path for the startup probe, used to determine if the application has started successfully. | +| `startupProbe.httpGet.port` | `int` | `5001` | The port for the HTTP startup probe. | +| `startupProbe.initialDelaySeconds` | `int` | `40` | The number of seconds to wait after the container has started before the startup probe is first initiated. | +| `startupProbe.periodSeconds` | `int` | `10` | The interval (in seconds) at which the startup probe will be executed. | +| `volumeMounts` | `list` | `[]` | A list of volume mounts that define how volumes should be mounted into the container's filesystem. | +| `volumes` | `list` | `[]` | A list of volume definitions that provide storage for the Pod. 
|
diff --git a/chart/templates/config.yaml b/chart/templates/config.yaml
index 62e18272c..f7f7c6373 100644
--- a/chart/templates/config.yaml
+++ b/chart/templates/config.yaml
@@ -1,3 +1,4 @@
+{{- if .Values.customRunConfig }}
 apiVersion: v1
 kind: ConfigMap
 metadata:
@@ -5,3 +6,4 @@ metadata:
 data:
   run.yaml: |-
     {{- .Files.Get "files/run.yaml" | nindent 4 }}
+{{- end }}
\ No newline at end of file
diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml
index 7f2a0ea2f..59488ffc5 100644
--- a/chart/templates/deployment.yaml
+++ b/chart/templates/deployment.yaml
@@ -38,9 +38,11 @@ spec:
             - name: http
               containerPort: {{ .Values.service.port }}
               protocol: TCP
+          {{- if .Values.customRunConfig }}
           args:
             - "--yaml-config"
             - "/config/run.yaml"
+          {{- end }}
           env:
             {{- with .Values.vllm }}
             - name: VLLM_URL
@@ -69,14 +71,30 @@ spec:
             {{- tpl (toYaml .Values.startupProbe) $ | nindent 12 }}
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
+          {{- /* config-volume is mounted only for a custom run config; user-supplied volumeMounts are appended after it. */}}
+          {{- if or .Values.customRunConfig .Values.volumeMounts }}
           volumeMounts:
+            {{- if .Values.customRunConfig }}
             - name: config-volume
               mountPath: /config
+            {{- end }}
+            {{- with .Values.volumeMounts }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          {{- end }}
+      {{- /* The run-config ConfigMap is rendered only when customRunConfig is true, so its volume is gated the same way. */}}
+      {{- if or .Values.customRunConfig .Values.volumes }}
       volumes:
+        {{- if .Values.customRunConfig }}
         - name: config-volume
           configMap:
             name: {{ include "llama-stack.fullname" . }}-run-config
             defaultMode: 0755
+        {{- end }}
+        {{- with .Values.volumes }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
       {{- with .Values.nodeSelector }}
       nodeSelector:
         {{- toYaml . | nindent 8 }}
diff --git a/chart/values.yaml b/chart/values.yaml
index 65e0ce61c..cbd359c60 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -1,4 +1,6 @@
-# yamlConfig: "/config/run.yaml"
+
+# When true, start llama-stack with the chart's `files/run.yaml` (mounted at /config/run.yaml) instead of the distribution's default run config
+customRunConfig: false
 
 # TODO: Currently we are only working for vLLM this should be expanded in the future
 vllm:
@@ -69,6 +71,19 @@ service:
   # This sets the ports more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports
   port: 5001
 
+# Additional volumes on the output Deployment definition.
+volumes: []
+# - name: foo
+#   secret:
+#     secretName: mysecret
+#     optional: false
+
+# Additional volumeMounts on the output Deployment definition.
+volumeMounts: []
+# - name: foo
+#   mountPath: "/etc/foo"
+#   readOnly: true
+
 # This block is for setting up the ingress for more information can be found here: https://kubernetes.io/docs/concepts/services-networking/ingress/
 ingress: