Repository: https://github.com/meta-llama/llama-stack
Commit f02fda0bd7 (parent 62c758932d): demo

12 changed files with 5521 additions and 14 deletions

@@ -59,6 +59,7 @@ The deployment process:
- Llama NIM (code model)
- PostgreSQL database
- Chroma vector database
- Jaeger (distributed tracing)
- Llama Stack server
- UI service
- Ingress configuration
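
Once `apply.sh` completes, a quick sanity check is to confirm that each of these components has a running pod (a suggested check, not part of the original README; the deployment name below is the one used later in this guide):

```bash
# Each component (NIM, PostgreSQL, Chroma, Jaeger, stack server, UI) should show a Running pod
kubectl get pods

# Optionally wait for the stack server to report ready
kubectl wait --for=condition=available deployment/llama-stack-server --timeout=300s
```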

@@ -124,7 +125,9 @@ The stack configuration is defined in `stack_run_config.yaml`. This file configu

If you need to modify this configuration, edit the file before running `apply.sh`.
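
If the file is changed after the stack is already running, one workable pattern (a sketch using the configmap and deployment names that appear elsewhere in this diff) is to regenerate the configmap and restart the server:

```bash
# Re-render the configmap from the edited file and apply it in place
kubectl create configmap llama-stack-config \
  --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml | kubectl apply -f -

# Restart the server so it picks up the new configuration
kubectl rollout restart deployment/llama-stack-server
```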

## Monitoring
## Monitoring and Telemetry

### Prometheus Monitoring

The deployment includes Prometheus monitoring capabilities:

@@ -133,6 +136,28 @@ The deployment includes Prometheus monitoring capabilities:
./install-prometheus.sh
```
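
The install script prints the exact Grafana service name on completion; access typically looks like the following (the release-specific suffix is taken from this commit's script output and will differ per install):

```bash
# Forward Grafana locally, then open http://localhost:31509 (default login: admin / prom-operator)
kubectl port-forward svc/kube-prometheus-stack-1754164871-grafana 31509:80 -n prometheus
```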

### Jaeger Tracing

The deployment includes Jaeger for distributed tracing:

1. **Access the Jaeger UI**:
   ```bash
   kubectl port-forward svc/jaeger 16686:16686
   ```
   Then open http://localhost:16686 in your browser.

2. **Trace Configuration**:
   - Traces are automatically sent from llama-stack to Jaeger
   - The service name is set to "llama-stack" by default
   - Traces include spans for API calls, model inference, and other operations

3. **Troubleshooting Traces**:
   - If traces are not appearing in Jaeger:
     - Verify Jaeger is running: `kubectl get pods | grep jaeger`
     - Check llama-stack logs: `kubectl logs -f deployment/llama-stack-server`
     - Ensure the OTLP endpoint is correctly configured in the stack configuration
     - Verify network connectivity between llama-stack and Jaeger
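
If those checks pass but traces still do not show up, grepping the server logs for telemetry output can confirm whether the exporter is attempting to send spans at all (a suggested extra check, not from the original guide):

```bash
# Look for OpenTelemetry / telemetry-related messages in the stack server logs
kubectl logs deployment/llama-stack-server | grep -iE "otel|telemetry|trace"
```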

## Cleanup

To remove all deployed resources:

@@ -176,6 +201,12 @@ This will:
kubectl get endpoints
```

5. **Traces not appearing in Jaeger**:
   - Check if the Jaeger pod is running: `kubectl get pods | grep jaeger`
   - Verify the llama-stack server is waiting for Jaeger to be ready before starting
   - Check the telemetry configuration in `stack_run_config.yaml`
   - Ensure the OTLP endpoint is correctly set to `http://jaeger.default.svc.cluster.local:4318`
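
Reachability of the OTLP endpoint can also be verified from inside the cluster with a throwaway curl pod (a sketch; substitute whichever OTLP endpoint your configuration actually uses):

```bash
# Any HTTP response (even a 405) means the collector is reachable;
# a DNS failure or timeout points to a service-name or networking problem.
kubectl run otlp-check --rm -it --restart=Never --image=curlimages/curl --command -- \
  curl -sv http://jaeger.default.svc.cluster.local:4318/v1/traces
```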

### Viewing Logs

```bash

@@ -183,6 +214,7 @@ This will:
kubectl logs -f deployment/llama-stack-server
kubectl logs -f deployment/vllm-server
kubectl logs -f deployment/llama-stack-ui
kubectl logs -f deployment/jaeger
```

## Advanced Configuration

@@ -194,13 +226,11 @@ You can modify the resource limits in the YAML template files before deployment:
- `vllm-k8s.yaml.template`: vLLM server resources
- `stack-k8s.yaml.template`: Llama Stack server resources
- `llama-nim.yaml.template`: NIM server resources

- `jaeger-k8s.yaml.template`: Jaeger server resources
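
After editing a template, it can be re-rendered and applied the same way `apply.sh` does (shown here for the vLLM template; the same pattern works for the other templates):

```bash
# Substitute environment variables into the template and apply the result
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
```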

## Additional Resources

- [Llama Stack Documentation](https://github.com/meta-llama/llama-stack)
- [vLLM Documentation](https://docs.vllm.ai/)
- [Kubernetes Documentation](https://kubernetes.io/docs/)
- [Jaeger Tracing Documentation](https://www.jaegertracing.io/docs/)

@@ -12,7 +12,7 @@ export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack

export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export INFERENCE_MODEL=meta-llama/Llama-3.3-70B-Instruct
export CODE_MODEL=bigcode/starcoder2-7b
export OLLAMA_MODEL=llama-guard3:1b
# Set USE_EBS to false if you don't have permission to use EKS EBS

@@ -80,6 +80,7 @@ if [ "$USE_EBS" = "true" ]; then
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -

kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \

@@ -99,6 +100,7 @@ else
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -

kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml

@@ -89,6 +89,10 @@ envsubst < ./ollama-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-
echo "Deleting vllm deployment..."
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true

# Delete jaeger deployment
echo "Deleting jaeger deployment..."
envsubst < ./jaeger-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true

# Delete the HF token secret if it exists
if [ -n "${HF_TOKEN:-}" ]; then
echo "Deleting HF token secret..."

@@ -109,6 +113,7 @@ for template in ./*.yaml.template; do
"$template" != "./vllm-safety-k8s.yaml.template" &&
"$template" != "./ollama-safety-k8s.yaml.template" &&
"$template" != "./vllm-k8s.yaml.template" &&
"$template" != "./jaeger-k8s.yaml.template" &&
"$template" != "./set-secret.yaml.template" &&
"$template" != "./ui-service-k8s.yaml.template" ]]; then
echo "Deleting resources from $template..."

@@ -133,6 +138,7 @@ kubectl delete service -l app=postgres --ignore-not-found=true
kubectl delete service -l app=vllm --ignore-not-found=true
kubectl delete service -l app=llama-nim --ignore-not-found=true
kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
kubectl delete service -l app=jaeger --ignore-not-found=true

# Delete any remaining secrets
echo "Deleting any remaining secrets..."

@@ -36,3 +36,17 @@ helm install prometheus prometheus-community/kube-prometheus-stack \
echo "kube-prometheus-stack has been installed successfully!"
echo "To access Grafana UI, run: kubectl port-forward svc/kube-prometheus-stack-1754164871-grafana 31509:80 -n prometheus"
echo "Default Grafana credentials - Username: admin, Password: prom-operator"
# 1. Add the official chart repo and update your cache
helm repo add jaegertracing https://jaegertracing.github.io/helm-charts
helm repo update

# 2. (Optional) Create a namespace for observability tools
kubectl create namespace observability

# 3. Install Cert-Manager once per cluster (operator webhooks need it)
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.0/cert-manager.yaml

# 4. Install the operator
helm install jaeger-operator jaegertracing/jaeger-operator \
  --namespace observability \
  --set rbac.clusterRole=true # watch the whole cluster

docs/source/distributions/k8s/jaeger-k8s.yaml.template (new file, 10 lines)
@@ -0,0 +1,10 @@
# jaeger-dev.yaml
apiVersion: jaegertracing.io/v1
kind: Jaeger
metadata:
  name: jaeger-dev
  namespace: observability
spec:
  strategy: allInOne   # single pod with agent, collector & query
  ingress:
    enabled: false     # set true + host rules if you use Ingress

docs/source/distributions/k8s/kube-prometheus-stack.values (new file, 5425 lines)
File diff suppressed because it is too large.

@@ -48,7 +48,7 @@ spec:
              name: ngc-api
              key: NGC_API_KEY
        - name: NVIDIA_VISIBLE_DEVICES
          value: "all"
          value: "0"
        - name: NVIDIA_DRIVER_CAPABILITIES
          value: "compute,utility"
        - name: ENABLE_GPU_METRICS

docs/source/distributions/k8s/port-foward.sh (new file, 4 lines)
@@ -0,0 +1,4 @@
kubectl port-forward svc/llama-stack-ui-service 8322:8322 &
kubectl port-forward svc/llama-stack-service 8321:8321 &
kubectl port-forward svc/jaeger-dev-query 16686:16686 -n observability &
kubectl port-forward svc/kube-prometheus-stack-1754270486-grafana 3000:3000 -n prometheus

@@ -64,8 +64,10 @@ data:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console}
      service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
      sinks: ['console', 'otel_trace', 'otel_metric']
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
      otel_exporter_otlp_protocol: "http/protobuf"
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search

@@ -75,6 +75,10 @@ spec:
          value: http://vllm-server.default.svc.cluster.local:8001/v1
        - name: VLLM_MAX_TOKENS
          value: "3072"
        - name: OTEL_EXPORTER_OTLP_ENDPOINT
          value: http://jaeger-dev-collector.observability:4318
        - name: OTEL_SERVICE_NAME
          value: llama-stack
        - name: NVIDIA_BASE_URL
          value: http://llm-nim-code.default.svc.cluster.local:8000
        - name: OLLAMA_BASE_URL

@@ -61,8 +61,10 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console}
      service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
      sinks: ['console', 'otel_trace', 'otel_metric']
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
      otel_exporter_otlp_protocol: "http/protobuf"
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search

@@ -39,8 +39,10 @@ spec:
        image: vllm/vllm-openai:latest
        command: ["/bin/sh", "-c"]
        args:
        - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 100000 --gpu-memory-utilization 0.9 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 2 --port 8001"
        - "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 4 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
        env:
        - name: NCCL_DEBUG
          value: "INFO"
        - name: INFERENCE_MODEL
          value: "${INFERENCE_MODEL}"
        - name: HUGGING_FACE_HUB_TOKEN

@@ -53,13 +55,19 @@ spec:
          name: http
        resources:
          limits:
            nvidia.com/gpu: 1
            nvidia.com/gpu: 4
          requests:
            nvidia.com/gpu: 1
            nvidia.com/gpu: 4
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.cache/huggingface
        - name: cache-volume
          mountPath: /dev/shm
      volumes:
      - emptyDir:
          medium: Memory
          sizeLimit: 4Gi
        name: cache-volume
      - name: llama-storage
        persistentVolumeClaim:
          claimName: vllm-models