mirror of
https://github.com/meta-llama/llama-stack.git
synced 2025-08-06 10:42:39 +00:00
Readme Updates
This commit is contained in:
parent
1bc1f08037
commit
8a61e028da
4 changed files with 102 additions and 56 deletions
127
chart/README.md
127
chart/README.md
|
@ -12,9 +12,12 @@ Optionally, the chart also supports the installation of the llama-stack-playgrou
|
|||
Create a `local-values.yaml` file with the following:
|
||||
|
||||
> **Note**
|
||||
> Chart currently only supports `vllm` framework directly. But other distributions can be used by modifying the `env` directly.
|
||||
> The chart currently supports only the `vllm` framework directly, but other distributions can be managed by adding to the `env` inside the values file.
|
||||
|
||||
```yaml
|
||||
|
||||
distribution: distribution-remote-vllm
|
||||
|
||||
vllm:
|
||||
url: "https://<MY_VLLM_INSTANCE>:443/v1"
|
||||
inferenceModel: "meta-llama/Llama-3.1-8B-Instruct"
|
||||
|
@ -27,59 +30,73 @@ Login to Kubernetes through the CLI and run:
|
|||
helm upgrade -i ollama-stack . -f local-values.yaml
|
||||
```
|
||||
|
||||
## Custom Configuration
|
||||
|
||||
By default, llama-stack uses the `run.yaml` config that comes with the specified distribution. For more granular control, `customRunConfig` can be set to `true`, in which case the Helm chart uses the contents of `files/run.yaml` instead.
|
||||
|
||||
## Values
|
||||
|
||||
| Key | Type | Default | Description |
|
||||
|-----|------|---------|-------------|
|
||||
| autoscaling.enabled | bool | `false` | |
|
||||
| autoscaling.maxReplicas | int | `100` | |
|
||||
| autoscaling.minReplicas | int | `1` | |
|
||||
| autoscaling.targetCPUUtilizationPercentage | int | `80` | |
|
||||
| distribution | string | `"distribution-remote-vllm"` | |
|
||||
| image.pullPolicy | string | `"Always"` | |
|
||||
| image.repository | string | `"docker.io/llamastack/{{ $.Values.distribution }}"` | |
|
||||
| image.tag | string | `"0.1.6"` | |
|
||||
| ingress.annotations | object | `{}` | |
|
||||
| ingress.className | string | `""` | |
|
||||
| ingress.enabled | bool | `true` | |
|
||||
| ingress.hosts[0].host | string | `"chart-example.local"` | |
|
||||
| ingress.hosts[0].paths[0].path | string | `"/"` | |
|
||||
| ingress.hosts[0].paths[0].pathType | string | `"ImplementationSpecific"` | |
|
||||
| ingress.tls | list | `[]` | |
|
||||
| livenessProbe.httpGet.path | string | `"/v1/health"` | |
|
||||
| livenessProbe.httpGet.port | int | `5001` | |
|
||||
| podAnnotations | object | `{}` | |
|
||||
| podLabels | object | `{}` | |
|
||||
| podSecurityContext | object | `{}` | |
|
||||
| readinessProbe.httpGet.path | string | `"/v1/health"` | |
|
||||
| readinessProbe.httpGet.port | int | `5001` | |
|
||||
| replicaCount | int | `1` | |
|
||||
| resources.limits.cpu | string | `"100m"` | |
|
||||
| resources.limits.memory | string | `"500Mi"` | |
|
||||
| resources.requests.cpu | string | `"100m"` | |
|
||||
| resources.requests.memory | string | `"500Mi"` | |
|
||||
| route | object | `{"annotations":{},"enabled":false,"host":"","path":"","tls":{"enabled":true,"insecureEdgeTerminationPolicy":"Redirect","termination":"edge"}}` | Enable creation of the OpenShift Route object (This should be used instead of ingress on OpenShift) |
|
||||
| route.annotations | object | `{}` | Additional custom annotations for the route |
|
||||
| route.host | string | Set by OpenShift | The hostname for the route |
|
||||
| route.path | string | `""` | The path for the OpenShift route |
|
||||
| route.tls.enabled | bool | `true` | Enable secure route settings |
|
||||
| route.tls.insecureEdgeTerminationPolicy | string | `"Redirect"` | Insecure route termination policy |
|
||||
| route.tls.termination | string | `"edge"` | Secure route termination policy |
|
||||
| runConfig.enabled | bool | `false` | |
|
||||
| service.port | int | `5001` | |
|
||||
| service.type | string | `"ClusterIP"` | |
|
||||
| serviceAccount.annotations | object | `{}` | |
|
||||
| serviceAccount.automount | bool | `true` | |
|
||||
| serviceAccount.create | bool | `false` | |
|
||||
| serviceAccount.name | string | `""` | |
|
||||
| startupProbe.failureThreshold | int | `30` | |
|
||||
| startupProbe.httpGet.path | string | `"/v1/health"` | |
|
||||
| startupProbe.httpGet.port | int | `5001` | |
|
||||
| startupProbe.initialDelaySeconds | int | `40` | |
|
||||
| startupProbe.periodSeconds | int | `10` | |
|
||||
| telemetry.enabled | bool | `false` | |
|
||||
| telemetry.serviceName | string | `"otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318"` | |
|
||||
| telemetry.sinks | string | `"console,sqlite,otel"` | |
|
||||
| vllm.inferenceModel | string | `"llama2-7b-chat"` | |
|
||||
| vllm.url | string | `"http://vllm-server"` | |
|
||||
| yamlConfig | string | `"/config/run.yaml"` | |
|
||||
### Llama Stack Specific
|
||||
|
||||
| Key | Type | Default | Description |
|
||||
| :---------------------- | :------- | :------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `customRunConfig` | `bool` | `false` | Indicates whether a custom run configuration is being used. |
|
||||
| `distribution` | `string` | `"distribution-remote-vllm"` | Specifies the distribution or type of deployment being used (in this case, related to a remote vLLM distribution). |
|
||||
| `telemetry.enabled` | `bool` | `false` | Enables or disables telemetry collection. |
|
||||
| `telemetry.serviceName` | `string` | `"otel-collector.openshift-opentelemetry-operator.svc.cluster.local:4318"` | The service name and address of the telemetry collector. |
|
||||
| `telemetry.sinks` | `string` | `"console,sqlite,otel"` | Specifies the destinations or sinks where telemetry data will be sent. |
|
||||
| `vllm.inferenceModel` | `string` | `"llama2-7b-chat"` | The specific inference model to be used by vLLM (a high-throughput and memory-efficient inference service for large language models). |
|
||||
| `vllm.url` | `string` | `"http://vllm-server"` | The URL of the vLLM service. |
|
||||
| `env` | `object` | N/A | A set of key/value pairs that can be set in the pod |
|
||||
|
||||
### General
|
||||
|
||||
| Key | Type | Default | Description |
|
||||
| :----------------------------------------- | :----- | :----------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `autoscaling.enabled` | `bool` | `false` | Enables or disables horizontal pod autoscaling, which automatically adjusts the number of running instances based on CPU utilization. |
|
||||
| `autoscaling.maxReplicas` | `int` | `100` | The maximum number of pod replicas that the autoscaler can scale up to. |
|
||||
| `autoscaling.minReplicas` | `int` | `1` | The minimum number of pod replicas that will always be running. |
|
||||
| `autoscaling.targetCPUUtilizationPercentage` | `int` | `80` | The target average CPU utilization across all running pods that the autoscaler will aim to maintain. |
|
||||
| `image.pullPolicy` | `string` | `"Always"` | Defines when to pull the Docker image for the container (e.g., always pull, pull if not present, etc.). |
|
||||
| `image.repository` | `string` | `"docker.io/llamastack/{{ $.Values.distribution }}"` | The Docker image repository where the container image is located. It likely uses the `distribution` value to construct the full image path. |
|
||||
| `image.tag` | `string` | `"0.1.6"` | The specific version tag of the Docker image to use. |
|
||||
| `ingress.annotations` | `object` | `{}` | Kubernetes Ingress annotations, which can be used to configure load balancers and other external access settings. |
|
||||
| `ingress.className` | `string` | `""` | The name of the IngressClass to use for this Ingress resource, which selects the Ingress controller that implements it. |
|
||||
| `ingress.enabled` | `bool` | `true` | Enables or disables the creation of a Kubernetes Ingress resource, which allows external access to the application. |
|
||||
| `ingress.hosts[0].host` | `string` | `"chart-example.local"` | The hostname that the Ingress will route traffic to. This is often a placeholder or example. |
|
||||
| `ingress.hosts[0].paths[0].path` | `string` | `"/"` | The path on the specified host that the Ingress will route traffic to (in this case, the root path). |
|
||||
| `ingress.hosts[0].paths[0].pathType` | `string` | `"ImplementationSpecific"` | The type of path matching used by the Ingress controller. |
|
||||
| `ingress.tls` | `list` | `[]` | Configuration for Transport Layer Security (TLS) termination at the Ingress, allowing for HTTPS. |
|
||||
| `livenessProbe.httpGet.path` | `string` | `"/v1/health"` | The HTTP endpoint path that the liveness probe will check to determine if the container is running and healthy. |
|
||||
| `livenessProbe.httpGet.port` | `int` | `5001` | The port that the liveness probe will connect to for the HTTP health check. |
|
||||
| `podAnnotations` | `object` | `{}` | Kubernetes Pod annotations, which can be used to attach arbitrary non-identifying metadata to the Pod. |
|
||||
| `podLabels` | `object` | `{}` | Kubernetes Pod labels, which are key/value pairs that are attached to Pods and can be used for organizing and selecting groups of Pods. |
|
||||
| `podSecurityContext` | `object` | `{}` | Defines the security context for the Pod, such as user and group IDs, security capabilities, etc. |
|
||||
| `readinessProbe.httpGet.path` | `string` | `"/v1/health"` | The HTTP endpoint path that the readiness probe will check to determine if the container is ready to serve traffic. |
|
||||
| `readinessProbe.httpGet.port` | `int` | `5001` | The port that the readiness probe will connect to for the HTTP readiness check. |
|
||||
| `replicaCount` | `int` | `1` | The desired number of pod replicas to run. |
|
||||
| `resources.limits.cpu` | `string` | `"100m"` | The maximum amount of CPU resources that a container can use (in millicores). |
|
||||
| `resources.limits.memory` | `string` | `"500Mi"` | The maximum amount of memory that a container can use (in mebibytes). |
|
||||
| `resources.requests.cpu` | `string` | `"100m"` | The amount of CPU resources that Kubernetes will guarantee to be available for the container. |
|
||||
| `resources.requests.memory` | `string` | `"500Mi"` | The amount of memory that Kubernetes will guarantee to be available for the container (in mebibytes). |
|
||||
| `route` | `object` | `{"annotations":{},"enabled":false,"host":"","path":"","tls":{"enabled":true,"insecureEdgeTerminationPolicy":"Redirect","termination":"edge"}}` | Configuration for an OpenShift Route object, which is used for exposing services externally on OpenShift. |
|
||||
| `route.annotations` | `object` | `{}` | Additional custom annotations for the OpenShift Route object. |
|
||||
| `route.host` | `string` | `Set by OpenShift` | The hostname for the OpenShift Route. This is typically managed by OpenShift. |
|
||||
| `route.path` | `string` | `""` | The path for the OpenShift Route. |
|
||||
| `route.tls.enabled` | `bool` | `true` | Enables or disables TLS for the OpenShift Route, providing secure communication. |
|
||||
| `route.tls.insecureEdgeTerminationPolicy` | `string` | `"Redirect"` | The policy for handling insecure (HTTP) requests when TLS termination is at the edge (Route). |
|
||||
| `route.tls.termination` | `string` | `"edge"` | Specifies that TLS termination occurs at the OpenShift Route edge. |
|
||||
| `runConfig.enabled` | `bool` | `false` | Indicates whether a specific run configuration is enabled. |
|
||||
| `service.port` | `int` | `5001` | The port on which the Kubernetes Service will be exposed internally within the cluster. |
|
||||
| `service.type` | `string` | `"ClusterIP"` | The type of Kubernetes Service. `ClusterIP` makes the service only reachable from within the cluster. |
|
||||
| `serviceAccount.annotations` | `object` | `{}` | Annotations for the Kubernetes ServiceAccount. |
|
||||
| `serviceAccount.automount` | `bool` | `true` | Indicates whether the ServiceAccount token should be automatically mounted into the Pods. |
|
||||
| `serviceAccount.create` | `bool` | `false` | Determines whether a new Kubernetes ServiceAccount should be created. |
|
||||
| `serviceAccount.name` | `string` | `""` | The name of an existing Kubernetes ServiceAccount to use. If `create` is true and this is empty, a default name will be generated. |
|
||||
| `startupProbe.failureThreshold` | `int` | `30` | The number of consecutive failures of the startup probe before Kubernetes considers the container failed to start. |
|
||||
| `startupProbe.httpGet.path` | `string` | `"/v1/health"` | The HTTP endpoint path for the startup probe, used to determine if the application has started successfully. |
|
||||
| `startupProbe.httpGet.port` | `int` | `5001` | The port for the HTTP startup probe. |
|
||||
| `startupProbe.initialDelaySeconds` | `int` | `40` | The number of seconds to wait after the container has started before the startup probe is first initiated. |
|
||||
| `startupProbe.periodSeconds` | `int` | `10` | The interval (in seconds) at which the startup probe will be executed. |
|
||||
| `volumeMounts` | `list` | `[]` | A list of volume mounts that define how volumes should be mounted into the container's filesystem. |
|
||||
| `volumes` | `list` | `[]` | A list of volume definitions that provide storage for the Pod. |
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
{{- if .Values.customRunConfig }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
|
@ -5,3 +6,4 @@ metadata:
|
|||
data:
|
||||
run.yaml: |-
|
||||
{{- .Files.Get "files/run.yaml" | nindent 4 }}
|
||||
{{- end }}
|
|
@ -38,9 +38,11 @@ spec:
|
|||
- name: http
|
||||
containerPort: {{ .Values.service.port }}
|
||||
protocol: TCP
|
||||
{{- if .Values.customRunConfig }}
|
||||
args:
|
||||
- "--yaml-config"
|
||||
- "/config/run.yaml"
|
||||
{{- end }}
|
||||
env:
|
||||
{{- with .Values.vllm }}
|
||||
- name: VLLM_URL
|
||||
|
@ -69,14 +71,24 @@ spec:
|
|||
{{- tpl (toYaml .Values.startupProbe) $ | nindent 12 }}
|
||||
resources:
|
||||
{{- toYaml .Values.resources | nindent 12 }}
|
||||
{{- if or .Values.customRunConfig .Values.volumeMounts }}
|
||||
volumeMounts:
|
||||
- name: config-volume
|
||||
mountPath: /config
|
||||
{{- with .Values.volumeMounts }}
|
||||
{{- toYaml . | nindent 12 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if or .Values.customRunConfig .Values.volumes }}
|
||||
volumes:
|
||||
- name: config-volume
|
||||
configMap:
|
||||
name: {{ include "llama-stack.fullname" . }}-run-config
|
||||
defaultMode: 0755
|
||||
{{- with .Values.volumes }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- with .Values.nodeSelector }}
|
||||
nodeSelector:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
# yamlConfig: "/config/run.yaml"
|
||||
|
||||
# When set to true, use the `run.yaml` file in the `files/` directory
|
||||
customRunConfig: false
|
||||
|
||||
# TODO: Currently only vLLM is supported; this should be expanded in the future
|
||||
vllm:
|
||||
|
@ -69,6 +71,19 @@ service:
|
|||
# This sets the ports; more information can be found here: https://kubernetes.io/docs/concepts/services-networking/service/#field-spec-ports
|
||||
port: 5001
|
||||
|
||||
# Additional volumes on the output Deployment definition.
|
||||
volumes: []
|
||||
# - name: foo
|
||||
# secret:
|
||||
# secretName: mysecret
|
||||
# optional: false
|
||||
|
||||
# Additional volumeMounts on the output Deployment definition.
|
||||
volumeMounts: []
|
||||
# - name: foo
|
||||
# mountPath: "/etc/foo"
|
||||
# readOnly: true
|
||||
|
||||
|
||||
# This block is for setting up the ingress; more information can be found here: https://kubernetes.io/docs/concepts/services-networking/ingress/
|
||||
ingress:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue