Kai Wu 2025-08-05 13:33:32 -07:00
parent 62c758932d
commit f02fda0bd7
12 changed files with 5521 additions and 14 deletions

View file

@@ -59,6 +59,7 @@ The deployment process:
- Llama NIM (code model)
- PostgreSQL database
- Chroma vector database
- Jaeger (distributed tracing)
- Llama Stack server
- UI service
- Ingress configuration
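Once `apply.sh` completes, a quick check confirms these components came up (a sketch; it assumes the default namespace used by the templates, while an operator-managed Jaeger instance would live in the `observability` namespace instead):

```bash
# List the deployed pods and services in the default namespace
kubectl get pods
kubectl get svc

# If Jaeger was installed via the operator, check the observability namespace instead
kubectl get pods -n observability
```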
@@ -124,7 +125,9 @@ The stack configuration is defined in `stack_run_config.yaml`. This file configu
If you need to modify this configuration, edit the file before running `apply.sh`.
-## Monitoring
+## Monitoring and Telemetry
### Prometheus Monitoring
The deployment includes Prometheus monitoring capabilities:
@@ -133,6 +136,28 @@ The deployment includes Prometheus monitoring capabilities:
./install-prometheus.sh
```
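After the script finishes, you can verify the monitoring stack and reach Grafana (a sketch; the Grafana service name carries a release timestamp suffix, so list the services rather than relying on a hard-coded name):

```bash
# Confirm the Prometheus and Grafana pods are running
kubectl get pods -n prometheus

# Find the Grafana service (its name includes a release timestamp) and forward it locally
kubectl get svc -n prometheus | grep grafana
kubectl port-forward svc/<grafana-service-name> 3000:80 -n prometheus
```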
### Jaeger Tracing
The deployment includes Jaeger for distributed tracing:
1. **Access the Jaeger UI**:
```bash
kubectl port-forward svc/jaeger 16686:16686
```
Then open http://localhost:16686 in your browser.
2. **Trace Configuration**:
- Traces are automatically sent from llama-stack to Jaeger over OTLP (see the config sketch after this list)
- The service name is set to "llama-stack" by default
- Traces include spans for API calls, model inference, and other operations
3. **Troubleshooting Traces**:
- If traces are not appearing in Jaeger:
- Verify Jaeger is running: `kubectl get pods | grep jaeger`
- Check llama-stack logs: `kubectl logs -f deployment/llama-stack-server`
- Ensure the OTLP endpoint is correctly configured in the stack configuration
- Verify network connectivity between llama-stack and Jaeger
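The defaults above come from the telemetry provider block in `stack_run_config.yaml`; roughly, it looks like this (surrounding keys abbreviated; the default OTLP endpoint targets the operator-managed collector in the `observability` namespace, so override `OTEL_EXPORTER_OTLP_ENDPOINT` if your Jaeger runs elsewhere):

```yaml
telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
      sinks: ['console', 'otel_trace', 'otel_metric']
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
      otel_exporter_otlp_protocol: "http/protobuf"
```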
## Cleanup
To remove all deployed resources:
@@ -176,6 +201,12 @@ This will:
kubectl get endpoints
```
5. **Traces not appearing in Jaeger**:
- Check if the Jaeger pod is running: `kubectl get pods | grep jaeger`
- Verify that the llama-stack server waits for Jaeger to be ready before starting
- Check the telemetry configuration in `stack_run_config.yaml`
- Ensure the OTLP endpoint is correctly set to `http://jaeger.default.svc.cluster.local:4318`
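To rule out basic connectivity issues, you can probe the OTLP HTTP port from inside the cluster (a sketch using a throwaway curl pod; any HTTP status in the response, even a 400, shows the collector is reachable; adjust the URL to match the endpoint your config actually uses):

```bash
# POST an empty payload to the OTLP/HTTP traces endpoint and print the status code
kubectl run otlp-check --rm -it --restart=Never --image=curlimages/curl --command -- \
  curl -s -o /dev/null -w "%{http_code}\n" \
  -X POST -H "Content-Type: application/json" -d '{}' \
  http://jaeger.default.svc.cluster.local:4318/v1/traces
```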
### Viewing Logs
```bash
@@ -183,6 +214,7 @@ This will:
kubectl logs -f deployment/llama-stack-server
kubectl logs -f deployment/vllm-server
kubectl logs -f deployment/llama-stack-ui
kubectl logs -f deployment/jaeger
```
## Advanced Configuration
@@ -194,13 +226,11 @@ You can modify the resource limits in the YAML template files before deployment:
- `vllm-k8s.yaml.template`: vLLM server resources
- `stack-k8s.yaml.template`: Llama Stack server resources
- `llama-nim.yaml.template`: NIM server resources
- `jaeger-k8s.yaml.template`: Jaeger server resources
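For example, the Jaeger container's resources block in `jaeger-k8s.yaml.template` could be tuned like this (illustrative values only; the field names are standard Kubernetes, but choose requests and limits that fit your cluster):

```yaml
resources:
  requests:
    cpu: 250m
    memory: 512Mi
  limits:
    cpu: "1"
    memory: 1Gi
```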
## Additional Resources
- [Llama Stack Documentation](https://github.com/meta-llama/llama-stack)
- [vLLM Documentation](https://docs.vllm.ai/)
- [Kubernetes Documentation](https://kubernetes.io/docs/)
- [Jaeger Tracing Documentation](https://www.jaegertracing.io/docs/)

View file

@@ -12,7 +12,7 @@ export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export INFERENCE_MODEL=meta-llama/Llama-3.3-70B-Instruct
export CODE_MODEL=bigcode/starcoder2-7b
export OLLAMA_MODEL=llama-guard3:1b
# Set USE_EBS to false if you don't have permission to use EKS EBS
@@ -80,6 +80,7 @@ if [ "$USE_EBS" = "true" ]; then
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
@@ -99,6 +100,7 @@ else
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
--dry-run=client -o yaml > stack-configmap.yaml

View file

@@ -89,6 +89,10 @@ envsubst < ./ollama-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-
echo "Deleting vllm deployment..."
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete jaeger deployment
echo "Deleting jaeger deployment..."
envsubst < ./jaeger-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete the HF token secret if it exists
if [ -n "${HF_TOKEN:-}" ]; then
echo "Deleting HF token secret..."
@@ -109,6 +113,7 @@ for template in ./*.yaml.template; do
"$template" != "./vllm-safety-k8s.yaml.template" &&
"$template" != "./ollama-safety-k8s.yaml.template" &&
"$template" != "./vllm-k8s.yaml.template" &&
"$template" != "./jaeger-k8s.yaml.template" &&
"$template" != "./set-secret.yaml.template" && "$template" != "./set-secret.yaml.template" &&
"$template" != "./ui-service-k8s.yaml.template" ]]; then "$template" != "./ui-service-k8s.yaml.template" ]]; then
echo "Deleting resources from $template..." echo "Deleting resources from $template..."
@ -133,6 +138,7 @@ kubectl delete service -l app=postgres --ignore-not-found=true
kubectl delete service -l app=vllm --ignore-not-found=true kubectl delete service -l app=vllm --ignore-not-found=true
kubectl delete service -l app=llama-nim --ignore-not-found=true kubectl delete service -l app=llama-nim --ignore-not-found=true
kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
kubectl delete service -l app=jaeger --ignore-not-found=true
# Delete any remaining secrets
echo "Deleting any remaining secrets..."

View file

@@ -36,3 +36,17 @@ helm install prometheus prometheus-community/kube-prometheus-stack \
echo "kube-prometheus-stack has been installed successfully!"
echo "To access Grafana UI, run: kubectl port-forward svc/kube-prometheus-stack-1754164871-grafana 31509:80 -n prometheus"
echo "Default Grafana credentials - Username: admin, Password: prom-operator"
# 1. Add the official chart repo and update your cache
helm repo add jaegertracing https://jaegertracing.github.io/helm-charts
helm repo update
# 2. (Optional) Create a namespace for observability tools
kubectl create namespace observability
# 3. Install Cert-Manager once per cluster (operator webhooks need it)
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.0/cert-manager.yaml
# 4. Install the operator
helm install jaeger-operator jaegertracing/jaeger-operator \
--namespace observability \
--set rbac.clusterRole=true # watch the whole cluster
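# 5. (Suggested follow-up, assuming jaeger-dev.yaml sits alongside this script)
#    Create the all-in-one Jaeger instance and confirm the operator reconciled it;
#    the "jaeger-dev" name and "observability" namespace come from the CR below.
kubectl apply -f jaeger-dev.yaml
kubectl get jaegers -n observability
kubectl get pods -n observability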

View file

@@ -0,0 +1,10 @@
# jaeger-dev.yaml
apiVersion: jaegertracing.io/v1
kind: Jaeger
metadata:
  name: jaeger-dev
  namespace: observability
spec:
  strategy: allInOne   # single pod with agent, collector & query
  ingress:
    enabled: false     # set true + host rules if you use Ingress

File diff suppressed because it is too large

View file

@@ -48,7 +48,7 @@ spec:
name: ngc-api
key: NGC_API_KEY
- name: NVIDIA_VISIBLE_DEVICES
-value: "all"
+value: "0"
- name: NVIDIA_DRIVER_CAPABILITIES
value: "compute,utility"
- name: ENABLE_GPU_METRICS

View file

@@ -0,0 +1,4 @@
kubectl port-forward svc/llama-stack-ui-service 8322:8322 &
kubectl port-forward svc/llama-stack-service 8321:8321 &
kubectl port-forward svc/jaeger-dev-query 16686:16686 -n observability &
kubectl port-forward svc/kube-prometheus-stack-1754270486-grafana 3000:3000 -n prometheus

View file

@@ -64,8 +64,10 @@ data:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
-service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
-sinks: ${env.TELEMETRY_SINKS:=console}
+sinks: ['console', 'otel_trace', 'otel_metric']
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
otel_exporter_otlp_protocol: "http/protobuf"
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search

View file

@@ -75,6 +75,10 @@ spec:
value: http://vllm-server.default.svc.cluster.local:8001/v1
- name: VLLM_MAX_TOKENS
value: "3072"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: http://jaeger-dev-collector.observability:4318
- name: OTEL_SERVICE_NAME
value: llama-stack
- name: NVIDIA_BASE_URL
value: http://llm-nim-code.default.svc.cluster.local:8000
- name: OLLAMA_BASE_URL

View file

@@ -61,8 +61,10 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
-service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
-sinks: ${env.TELEMETRY_SINKS:=console}
+sinks: ['console', 'otel_trace', 'otel_metric']
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
otel_exporter_otlp_protocol: "http/protobuf"
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search

View file

@@ -39,8 +39,10 @@ spec:
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args:
-- "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 100000 --gpu-memory-utilization 0.9 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 2 --port 8001"
+- "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 4 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
env:
- name: NCCL_DEBUG
value: "INFO"
- name: INFERENCE_MODEL
value: "${INFERENCE_MODEL}"
- name: HUGGING_FACE_HUB_TOKEN
@@ -53,13 +55,19 @@ spec:
name: http
resources:
limits:
-nvidia.com/gpu: 1
+nvidia.com/gpu: 4
requests:
-nvidia.com/gpu: 1
+nvidia.com/gpu: 4
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
- name: cache-volume
mountPath: /dev/shm
volumes:
- emptyDir:
medium: Memory
sizeLimit: 4Gi
name: cache-volume
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models