Repository: https://github.com/meta-llama/llama-stack
Commit f02fda0bd7 (parent 62c758932d): demo

12 changed files with 5521 additions and 14 deletions

@@ -59,6 +59,7 @@ The deployment process:
- Llama NIM (code model)
- PostgreSQL database
- Chroma vector database
- Jaeger (distributed tracing)
- Llama Stack server
- UI service
- Ingress configuration
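
Once `apply.sh` completes, a quick sanity check is to confirm that each of these components has a running pod (a suggested check, not part of the original README; the deployment name below is the one used later in this guide):

```bash
# Each component (NIM, PostgreSQL, Chroma, Jaeger, stack server, UI) should show a Running pod
kubectl get pods

# Optionally wait for the stack server to report ready
kubectl wait --for=condition=available deployment/llama-stack-server --timeout=300s
```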

@@ -124,7 +125,9 @@ The stack configuration is defined in `stack_run_config.yaml`. This file configu

If you need to modify this configuration, edit the file before running `apply.sh`.
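
If the file is changed after the stack is already running, one workable pattern (a sketch using the configmap and deployment names that appear elsewhere in this diff) is to regenerate the configmap and restart the server:

```bash
# Re-render the configmap from the edited file and apply it in place
kubectl create configmap llama-stack-config \
  --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml | kubectl apply -f -

# Restart the server so it picks up the new configuration
kubectl rollout restart deployment/llama-stack-server
```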

## Monitoring
## Monitoring and Telemetry

### Prometheus Monitoring

The deployment includes Prometheus monitoring capabilities:

@@ -133,6 +136,28 @@ The deployment includes Prometheus monitoring capabilities:
./install-prometheus.sh
```
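
The install script prints the exact Grafana service name on completion; access typically looks like the following (the release-specific suffix is taken from this commit's script output and will differ per install):

```bash
# Forward Grafana locally, then open http://localhost:31509 (default login: admin / prom-operator)
kubectl port-forward svc/kube-prometheus-stack-1754164871-grafana 31509:80 -n prometheus
```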

### Jaeger Tracing

The deployment includes Jaeger for distributed tracing:

1. **Access the Jaeger UI**:
   ```bash
   kubectl port-forward svc/jaeger 16686:16686
   ```
   Then open http://localhost:16686 in your browser.

2. **Trace Configuration**:
   - Traces are automatically sent from llama-stack to Jaeger
   - The service name is set to "llama-stack" by default
   - Traces include spans for API calls, model inference, and other operations

3. **Troubleshooting Traces**:
   - If traces are not appearing in Jaeger:
     - Verify Jaeger is running: `kubectl get pods | grep jaeger`
     - Check llama-stack logs: `kubectl logs -f deployment/llama-stack-server`
     - Ensure the OTLP endpoint is correctly configured in the stack configuration
     - Verify network connectivity between llama-stack and Jaeger
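
If those checks pass but traces still do not show up, grepping the server logs for telemetry output can confirm whether the exporter is attempting to send spans at all (a suggested extra check, not from the original guide):

```bash
# Look for OpenTelemetry / telemetry-related messages in the stack server logs
kubectl logs deployment/llama-stack-server | grep -iE "otel|telemetry|trace"
```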

## Cleanup

To remove all deployed resources:

@@ -176,6 +201,12 @@ This will:
kubectl get endpoints
```

5. **Traces not appearing in Jaeger**:
   - Check if the Jaeger pod is running: `kubectl get pods | grep jaeger`
   - Verify the llama-stack server is waiting for Jaeger to be ready before starting
   - Check the telemetry configuration in `stack_run_config.yaml`
   - Ensure the OTLP endpoint is correctly set to `http://jaeger.default.svc.cluster.local:4318`
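
Reachability of the OTLP endpoint can also be verified from inside the cluster with a throwaway curl pod (a sketch; substitute whichever OTLP endpoint your configuration actually uses):

```bash
# Any HTTP response (even a 405) means the collector is reachable;
# a DNS failure or timeout points to a service-name or networking problem.
kubectl run otlp-check --rm -it --restart=Never --image=curlimages/curl --command -- \
  curl -sv http://jaeger.default.svc.cluster.local:4318/v1/traces
```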

### Viewing Logs

```bash

@@ -183,6 +214,7 @@ This will:
kubectl logs -f deployment/llama-stack-server
kubectl logs -f deployment/vllm-server
kubectl logs -f deployment/llama-stack-ui
kubectl logs -f deployment/jaeger
```

## Advanced Configuration

@@ -194,13 +226,11 @@ You can modify the resource limits in the YAML template files before deployment:
- `vllm-k8s.yaml.template`: vLLM server resources
- `stack-k8s.yaml.template`: Llama Stack server resources
- `llama-nim.yaml.template`: NIM server resources

- `jaeger-k8s.yaml.template`: Jaeger server resources
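
After editing a template, it can be re-rendered and applied the same way `apply.sh` does (shown here for the vLLM template; the same pattern works for the other templates):

```bash
# Substitute environment variables into the template and apply the result
envsubst < ./vllm-k8s.yaml.template | kubectl apply -f -
```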

## Additional Resources

- [Llama Stack Documentation](https://github.com/meta-llama/llama-stack)
- [vLLM Documentation](https://docs.vllm.ai/)
- [Kubernetes Documentation](https://kubernetes.io/docs/)
- [Jaeger Tracing Documentation](https://www.jaegertracing.io/docs/)

@@ -12,7 +12,7 @@ export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack

export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
export INFERENCE_MODEL=meta-llama/Llama-3.3-70B-Instruct
export CODE_MODEL=bigcode/starcoder2-7b
export OLLAMA_MODEL=llama-guard3:1b
# Set USE_EBS to false if you don't have permission to use EKS EBS

@@ -80,6 +80,7 @@ if [ "$USE_EBS" = "true" ]; then
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -

kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \

@@ -99,6 +100,7 @@ else
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -

kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
  --dry-run=client -o yaml > stack-configmap.yaml

@@ -89,6 +89,10 @@ envsubst < ./ollama-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-
echo "Deleting vllm deployment..."
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true

# Delete jaeger deployment
echo "Deleting jaeger deployment..."
envsubst < ./jaeger-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true

# Delete the HF token secret if it exists
if [ -n "${HF_TOKEN:-}" ]; then
echo "Deleting HF token secret..."

@@ -109,6 +113,7 @@ for template in ./*.yaml.template; do
"$template" != "./vllm-safety-k8s.yaml.template" &&
"$template" != "./ollama-safety-k8s.yaml.template" &&
"$template" != "./vllm-k8s.yaml.template" &&
"$template" != "./jaeger-k8s.yaml.template" &&
"$template" != "./set-secret.yaml.template" &&
"$template" != "./ui-service-k8s.yaml.template" ]]; then
echo "Deleting resources from $template..."

@@ -133,6 +138,7 @@ kubectl delete service -l app=postgres --ignore-not-found=true
kubectl delete service -l app=vllm --ignore-not-found=true
kubectl delete service -l app=llama-nim --ignore-not-found=true
kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
kubectl delete service -l app=jaeger --ignore-not-found=true

# Delete any remaining secrets
echo "Deleting any remaining secrets..."

@@ -36,3 +36,17 @@ helm install prometheus prometheus-community/kube-prometheus-stack \
echo "kube-prometheus-stack has been installed successfully!"
echo "To access Grafana UI, run: kubectl port-forward svc/kube-prometheus-stack-1754164871-grafana 31509:80 -n prometheus"
echo "Default Grafana credentials - Username: admin, Password: prom-operator"
# 1. Add the official chart repo and update your cache
helm repo add jaegertracing https://jaegertracing.github.io/helm-charts
helm repo update

# 2. (Optional) Create a namespace for observability tools
kubectl create namespace observability

# 3. Install Cert-Manager once per cluster (operator webhooks need it)
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.0/cert-manager.yaml

# 4. Install the operator
helm install jaeger-operator jaegertracing/jaeger-operator \
  --namespace observability \
  --set rbac.clusterRole=true # watch the whole cluster

docs/source/distributions/k8s/jaeger-k8s.yaml.template (new file, 10 lines)
@@ -0,0 +1,10 @@
# jaeger-dev.yaml
apiVersion: jaegertracing.io/v1
kind: Jaeger
metadata:
  name: jaeger-dev
  namespace: observability
spec:
  strategy: allInOne   # single pod with agent, collector & query
  ingress:
    enabled: false     # set true + host rules if you use Ingress

docs/source/distributions/k8s/kube-prometheus-stack.values (new file, 5425 lines)
File diff suppressed because it is too large.

@@ -48,7 +48,7 @@ spec:
              name: ngc-api
              key: NGC_API_KEY
        - name: NVIDIA_VISIBLE_DEVICES
          value: "all"
          value: "0"
        - name: NVIDIA_DRIVER_CAPABILITIES
          value: "compute,utility"
        - name: ENABLE_GPU_METRICS

docs/source/distributions/k8s/port-foward.sh (new file, 4 lines)
@@ -0,0 +1,4 @@
kubectl port-forward svc/llama-stack-ui-service 8322:8322 &
kubectl port-forward svc/llama-stack-service 8321:8321 &
kubectl port-forward svc/jaeger-dev-query 16686:16686 -n observability &
kubectl port-forward svc/kube-prometheus-stack-1754270486-grafana 3000:3000 -n prometheus

@@ -64,8 +64,10 @@ data:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console}
      service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
      sinks: ['console', 'otel_trace', 'otel_metric']
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
      otel_exporter_otlp_protocol: "http/protobuf"
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search

@@ -75,6 +75,10 @@ spec:
          value: http://vllm-server.default.svc.cluster.local:8001/v1
        - name: VLLM_MAX_TOKENS
          value: "3072"
        - name: OTEL_EXPORTER_OTLP_ENDPOINT
          value: http://jaeger-dev-collector.observability:4318
        - name: OTEL_SERVICE_NAME
          value: llama-stack
        - name: NVIDIA_BASE_URL
          value: http://llm-nim-code.default.svc.cluster.local:8000
        - name: OLLAMA_BASE_URL

@@ -61,8 +61,10 @@ providers:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
      sinks: ${env.TELEMETRY_SINKS:=console}
      service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
      sinks: ['console', 'otel_trace', 'otel_metric']
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
      otel_exporter_otlp_protocol: "http/protobuf"
  tool_runtime:
  - provider_id: brave-search
    provider_type: remote::brave-search

@@ -39,8 +39,10 @@ spec:
        image: vllm/vllm-openai:latest
        command: ["/bin/sh", "-c"]
        args:
        - "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 100000 --gpu-memory-utilization 0.9 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 2 --port 8001"
        - "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 4 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
        env:
        - name: NCCL_DEBUG
          value: "INFO"
        - name: INFERENCE_MODEL
          value: "${INFERENCE_MODEL}"
        - name: HUGGING_FACE_HUB_TOKEN

@@ -53,13 +55,19 @@ spec:
          name: http
        resources:
          limits:
            nvidia.com/gpu: 1
            nvidia.com/gpu: 4
          requests:
            nvidia.com/gpu: 1
            nvidia.com/gpu: 4
        volumeMounts:
        - name: llama-storage
          mountPath: /root/.cache/huggingface
        - name: cache-volume
          mountPath: /dev/shm
      volumes:
      - emptyDir:
          medium: Memory
          sizeLimit: 4Gi
        name: cache-volume
      - name: llama-storage
        persistentVolumeClaim:
          claimName: vllm-models