commit f02fda0bd7 (parent 62c758932d)

    demo

12 changed files with 5521 additions and 14 deletions
@@ -59,6 +59,7 @@ The deployment process:
 - Llama NIM (code model)
 - PostgreSQL database
 - Chroma vector database
+- Jaeger (distributed tracing)
 - Llama Stack server
 - UI service
 - Ingress configuration
@@ -124,7 +125,9 @@ The stack configuration is defined in `stack_run_config.yaml`. This file configu
 
 If you need to modify this configuration, edit the file before running `apply.sh`.
 
-## Monitoring
+## Monitoring and Telemetry
+
+### Prometheus Monitoring
 
 The deployment includes Prometheus monitoring capabilities:
 
@@ -133,6 +136,28 @@ The deployment includes Prometheus monitoring capabilities:
 ./install-prometheus.sh
 ```
 
+### Jaeger Tracing
+
+The deployment includes Jaeger for distributed tracing:
+
+1. **Access the Jaeger UI**:
+
+   ```bash
+   kubectl port-forward svc/jaeger-dev-query 16686:16686 -n observability
+   ```
+
+   Then open http://localhost:16686 in your browser.
+
+2. **Trace Configuration**:
+   - Traces are automatically sent from llama-stack to Jaeger
+   - The service name is set to "llama-stack" by default
+   - Traces include spans for API calls, model inference, and other operations
+
+3. **Troubleshooting Traces**:
+   - If traces are not appearing in Jaeger:
+     - Verify Jaeger is running: `kubectl get pods -n observability | grep jaeger`
+     - Check llama-stack logs: `kubectl logs -f deployment/llama-stack-server`
+     - Ensure the OTLP endpoint is correctly configured in the stack configuration
+     - Verify network connectivity between llama-stack and Jaeger
+
 ## Cleanup
 
 To remove all deployed resources:
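To confirm spans are actually reaching Jaeger once the stack is up, the query API can be polled directly. A minimal sketch, assuming the `jaeger-dev` instance in the `observability` namespace that this commit creates:

```bash
# Forward the query service, then ask Jaeger which services have reported spans.
kubectl port-forward svc/jaeger-dev-query 16686:16686 -n observability &
sleep 2
curl -s http://localhost:16686/api/services
# A healthy pipeline lists "llama-stack" among the returned service names.
```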
@@ -176,6 +201,12 @@ This will:
 kubectl get endpoints
 ```
 
+5. **Traces not appearing in Jaeger**:
+   - Check if the Jaeger pod is running: `kubectl get pods -n observability | grep jaeger`
+   - Verify that the llama-stack server waits for Jaeger to be ready before starting
+   - Check the telemetry configuration in `stack_run_config.yaml`
+   - Ensure the OTLP endpoint is correctly set to `http://jaeger-dev-collector.observability:4318`
+
 ### Viewing Logs
 
 ```bash
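When traces are missing, it also helps to rule out basic reachability of the collector's OTLP/HTTP port. A sketch using a throwaway curl pod; the service name matches the telemetry config added in this commit:

```bash
# POST an empty OTLP trace export. Any HTTP status (200, or 4xx for a bad body)
# proves the collector is reachable; a timeout points at networking instead.
kubectl run otlp-check --rm -i --restart=Never --image=curlimages/curl -- \
  curl -s -o /dev/null -w '%{http_code}\n' \
  -X POST http://jaeger-dev-collector.observability:4318/v1/traces \
  -H 'Content-Type: application/json' -d '{}'
```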
@@ -183,6 +214,7 @@ This will:
 kubectl logs -f deployment/llama-stack-server
 kubectl logs -f deployment/vllm-server
 kubectl logs -f deployment/llama-stack-ui
+kubectl logs -f deployment/jaeger-dev -n observability
 ```
 
 ## Advanced Configuration
@@ -194,13 +226,11 @@ You can modify the resource limits in the YAML template files before deployment:
 - `vllm-k8s.yaml.template`: vLLM server resources
 - `stack-k8s.yaml.template`: Llama Stack server resources
 - `llama-nim.yaml.template`: NIM server resources
+- `jaeger-k8s.yaml.template`: Jaeger server resources
 
-
-
-
-
 ## Additional Resources
 
 - [Llama Stack Documentation](https://github.com/meta-llama/llama-stack)
 - [vLLM Documentation](https://docs.vllm.ai/)
 - [Kubernetes Documentation](https://kubernetes.io/docs/)
+- [Jaeger Tracing Documentation](https://www.jaegertracing.io/docs/)
apply.sh
@@ -12,7 +12,7 @@ export POSTGRES_USER=llamastack
 export POSTGRES_DB=llamastack
 export POSTGRES_PASSWORD=llamastack
 
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export INFERENCE_MODEL=meta-llama/Llama-3.3-70B-Instruct
 export CODE_MODEL=bigcode/starcoder2-7b
 export OLLAMA_MODEL=llama-guard3:1b
 # Set USE_EBS to false if you don't have permission to use EKS EBS
@@ -80,6 +80,7 @@ if [ "$USE_EBS" = "true" ]; then
   envsubst < ./llama-nim.yaml.template | kubectl apply -f -
   envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
+  envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -
 
 
   kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
@@ -99,6 +100,7 @@ else
   envsubst < ./llama-nim.yaml.template | kubectl apply -f -
   envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
   envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
+  envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -
 
   kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
     --dry-run=client -o yaml > stack-configmap.yaml
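The two `sed` filters in the non-EBS branch rewrite every PVC-backed volume into an ephemeral one. A sketch of the transformation, with a hypothetical volume and claim name, previewed without applying anything:

```bash
# Input fragment:                    # After the sed pipeline:
#   volumes:                         #   volumes:
#   - name: postgres-storage         #   - name: postgres-storage
#     persistentVolumeClaim:         #     emptyDir: {}
#       claimName: postgres-data
envsubst < ./postgres-k8s.yaml.template \
  | sed 's/persistentVolumeClaim:/emptyDir: {}/g' \
  | sed '/claimName:/d' \
  | kubectl apply --dry-run=client -f -
```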
@@ -89,6 +89,10 @@ envsubst < ./ollama-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-
 echo "Deleting vllm deployment..."
 envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
 
+# Delete jaeger deployment
+echo "Deleting jaeger deployment..."
+envsubst < ./jaeger-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
+
 # Delete the HF token secret if it exists
 if [ -n "${HF_TOKEN:-}" ]; then
   echo "Deleting HF token secret..."
@@ -109,6 +113,7 @@ for template in ./*.yaml.template; do
         "$template" != "./vllm-safety-k8s.yaml.template" &&
         "$template" != "./ollama-safety-k8s.yaml.template" &&
         "$template" != "./vllm-k8s.yaml.template" &&
+        "$template" != "./jaeger-k8s.yaml.template" &&
         "$template" != "./set-secret.yaml.template" &&
         "$template" != "./ui-service-k8s.yaml.template" ]]; then
     echo "Deleting resources from $template..."
@@ -133,6 +138,7 @@ kubectl delete service -l app=postgres --ignore-not-found=true
 kubectl delete service -l app=vllm --ignore-not-found=true
 kubectl delete service -l app=llama-nim --ignore-not-found=true
 kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
+kubectl delete service -l app=jaeger --ignore-not-found=true
 
 # Delete any remaining secrets
 echo "Deleting any remaining secrets..."
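A quick post-cleanup check that nothing tracing-related is left behind. A sketch; the label matches the delete command above, and if the Jaeger CRD itself has been removed the second command errors out, which is equally conclusive:

```bash
kubectl get deploy,svc -l app=jaeger --ignore-not-found
kubectl get jaegers.jaegertracing.io -n observability --ignore-not-found
# Both should print nothing once cleanup has finished.
```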
install-prometheus.sh
@@ -36,3 +36,17 @@ helm install prometheus prometheus-community/kube-prometheus-stack \
 echo "kube-prometheus-stack has been installed successfully!"
 echo "To access Grafana UI, run: kubectl port-forward svc/kube-prometheus-stack-1754164871-grafana 31509:80 -n prometheus"
 echo "Default Grafana credentials - Username: admin, Password: prom-operator"
+
+# 1. Add the official chart repo and update your cache
+helm repo add jaegertracing https://jaegertracing.github.io/helm-charts
+helm repo update
+
+# 2. Create a namespace for observability tools (the operator is installed into it)
+kubectl create namespace observability
+
+# 3. Install cert-manager once per cluster (the operator's webhooks need it)
+kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.0/cert-manager.yaml
+
+# 4. Install the operator
+helm install jaeger-operator jaegertracing/jaeger-operator \
+  --namespace observability \
+  --set rbac.clusterRole=true  # watch the whole cluster
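The `Jaeger` custom resource below can only be admitted once cert-manager's webhooks and the operator are ready, so it is worth gating on them. A sketch, assuming the default deployment names of both charts:

```bash
kubectl -n cert-manager rollout status deploy/cert-manager-webhook --timeout=120s
kubectl -n observability rollout status deploy/jaeger-operator --timeout=120s
kubectl get crd jaegers.jaegertracing.io   # the CRD the template below relies on
```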
docs/source/distributions/k8s/jaeger-k8s.yaml.template (new file, 10 lines)
@@ -0,0 +1,10 @@
+# jaeger-dev.yaml
+apiVersion: jaegertracing.io/v1
+kind: Jaeger
+metadata:
+  name: jaeger-dev
+  namespace: observability
+spec:
+  strategy: allInOne  # single pod with agent, collector & query
+  ingress:
+    enabled: false  # set true + host rules if you use Ingress
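Once the operator is up, applying this template yields the all-in-one pod plus its services. A quick check, using the names the operator derives from the CR:

```bash
envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -
kubectl get jaeger jaeger-dev -n observability      # CR status
kubectl get svc -n observability | grep jaeger-dev  # expect jaeger-dev-query and jaeger-dev-collector
```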
docs/source/distributions/k8s/kube-prometheus-stack.values (new file, 5425 lines)
File diff suppressed because it is too large.
llama-nim.yaml.template
@@ -48,7 +48,7 @@ spec:
              name: ngc-api
              key: NGC_API_KEY
        - name: NVIDIA_VISIBLE_DEVICES
-         value: "all"
+         value: "0"
        - name: NVIDIA_DRIVER_CAPABILITIES
          value: "compute,utility"
        - name: ENABLE_GPU_METRICS
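Pinning `NVIDIA_VISIBLE_DEVICES` to `"0"` confines the NIM to a single GPU instead of claiming all of them, which matters now that the vLLM server below takes four. A sanity check; the deployment name is inferred from the `llm-nim-code` service referenced elsewhere in this distribution, and it assumes the image ships `nvidia-smi`:

```bash
kubectl exec deploy/llm-nim-code -- nvidia-smi -L   # should list exactly one GPU
```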
docs/source/distributions/k8s/port-foward.sh (new file, 4 lines)
@@ -0,0 +1,4 @@
+kubectl port-forward svc/llama-stack-ui-service 8322:8322 &
+kubectl port-forward svc/llama-stack-service 8321:8321 &
+kubectl port-forward svc/jaeger-dev-query 16686:16686 -n observability &
+kubectl port-forward svc/kube-prometheus-stack-1754270486-grafana 3000:3000 -n prometheus
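The first three forwards are backgrounded and keep running after the script exits. A usage sketch:

```bash
chmod +x port-foward.sh
./port-foward.sh   # UI on :8322, API on :8321, Jaeger on :16686, Grafana on :3000
# Ctrl-C stops only the foreground Grafana forward; clean up the rest with:
pkill -f "kubectl port-forward"
```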
stack-configmap.yaml
@@ -64,8 +64,10 @@ data:
       - provider_id: meta-reference
         provider_type: inline::meta-reference
         config:
-          service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-          sinks: ${env.TELEMETRY_SINKS:=console}
+          service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
+          sinks: ['console', 'otel_trace', 'otel_metric']
+          otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
+          otel_exporter_otlp_protocol: "http/protobuf"
       tool_runtime:
       - provider_id: brave-search
         provider_type: remote::brave-search
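This ConfigMap is generated from `stack_run_config.yaml` by `apply.sh`, so after editing the run config the map has to be regenerated and the server restarted. A sketch:

```bash
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml | kubectl apply -f -
kubectl rollout restart deployment/llama-stack-server   # pick up the new telemetry config
```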
stack-k8s.yaml.template
@@ -75,6 +75,10 @@ spec:
           value: http://vllm-server.default.svc.cluster.local:8001/v1
         - name: VLLM_MAX_TOKENS
           value: "3072"
+        - name: OTEL_EXPORTER_OTLP_ENDPOINT
+          value: http://jaeger-dev-collector.observability:4318
+        - name: OTEL_SERVICE_NAME
+          value: llama-stack
         - name: NVIDIA_BASE_URL
           value: http://llm-nim-code.default.svc.cluster.local:8000
         - name: OLLAMA_BASE_URL
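These variables feed the `${env....}` defaults in the telemetry config above. To confirm they landed in the running pod (a sketch):

```bash
kubectl exec deploy/llama-stack-server -- \
  printenv OTEL_EXPORTER_OTLP_ENDPOINT OTEL_SERVICE_NAME
```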
stack_run_config.yaml
@@ -61,8 +61,10 @@ providers:
   - provider_id: meta-reference
     provider_type: inline::meta-reference
     config:
-      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
-      sinks: ${env.TELEMETRY_SINKS:=console}
+      service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
+      sinks: ['console', 'otel_trace', 'otel_metric']
+      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
+      otel_exporter_otlp_protocol: "http/protobuf"
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
vllm-k8s.yaml.template
@@ -39,8 +39,10 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-          - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 100000 --gpu-memory-utilization 0.9 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 2 --port 8001"
+          - "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 4 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
         env:
+        - name: NCCL_DEBUG
+          value: "INFO"
         - name: INFERENCE_MODEL
           value: "${INFERENCE_MODEL}"
         - name: HUGGING_FACE_HUB_TOKEN
@@ -53,13 +55,19 @@ spec:
           name: http
         resources:
           limits:
-            nvidia.com/gpu: 1
+            nvidia.com/gpu: 4
           requests:
-            nvidia.com/gpu: 1
+            nvidia.com/gpu: 4
         volumeMounts:
         - name: llama-storage
           mountPath: /root/.cache/huggingface
+        - name: cache-volume
+          mountPath: /dev/shm
       volumes:
+      - emptyDir:
+          medium: Memory
+          sizeLimit: 4Gi
+        name: cache-volume
       - name: llama-storage
         persistentVolumeClaim:
           claimName: vllm-models
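`-tp 4` shards the model across four GPUs, which is why both requests and limits move to `nvidia.com/gpu: 4`, and the in-memory `/dev/shm` volume gives NCCL the shared-memory segment it needs for tensor-parallel communication. Two sanity checks once the pod is up (a sketch):

```bash
kubectl exec deploy/vllm-server -- nvidia-smi --query-gpu=index,memory.used --format=csv
kubectl logs deploy/vllm-server | grep -i nccl   # NCCL_DEBUG=INFO logs ring/topology setup
```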