Kai Wu 2025-08-05 13:33:32 -07:00
parent 62c758932d
commit f02fda0bd7
12 changed files with 5521 additions and 14 deletions

View file

@@ -59,6 +59,7 @@ The deployment process:
- Llama NIM (code model)
- PostgreSQL database
- Chroma vector database
- Jaeger (distributed tracing)
- Llama Stack server
- UI service
- Ingress configuration
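Once `apply.sh` completes, a quick check confirms these components came up (a sketch; it assumes the default namespace used by the templates, while an operator-managed Jaeger instance would live in the `observability` namespace instead):

```bash
# List the deployed pods and services in the default namespace
kubectl get pods
kubectl get svc

# If Jaeger was installed via the operator, check the observability namespace instead
kubectl get pods -n observability
```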
@@ -124,7 +125,9 @@ The stack configuration is defined in `stack_run_config.yaml`. This file configu
If you need to modify this configuration, edit the file before running `apply.sh`.
-## Monitoring
+## Monitoring and Telemetry
### Prometheus Monitoring
The deployment includes Prometheus monitoring capabilities:
@@ -133,6 +136,28 @@ The deployment includes Prometheus monitoring capabilities:
./install-prometheus.sh
```
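After the script finishes, you can verify the monitoring stack and reach Grafana (a sketch; the Grafana service name carries a release timestamp suffix, so list the services rather than relying on a hard-coded name):

```bash
# Confirm the Prometheus and Grafana pods are running
kubectl get pods -n prometheus

# Find the Grafana service (its name includes a release timestamp) and forward it locally
kubectl get svc -n prometheus | grep grafana
kubectl port-forward svc/<grafana-service-name> 3000:80 -n prometheus
```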
### Jaeger Tracing
The deployment includes Jaeger for distributed tracing:
1. **Access the Jaeger UI**:
```bash
kubectl port-forward svc/jaeger 16686:16686
```
Then open http://localhost:16686 in your browser.
2. **Trace Configuration**:
- Traces are automatically sent from llama-stack to Jaeger over OTLP (see the config sketch after this list)
- The service name is set to "llama-stack" by default
- Traces include spans for API calls, model inference, and other operations
3. **Troubleshooting Traces**:
- If traces are not appearing in Jaeger:
- Verify Jaeger is running: `kubectl get pods | grep jaeger`
- Check llama-stack logs: `kubectl logs -f deployment/llama-stack-server`
- Ensure the OTLP endpoint is correctly configured in the stack configuration
- Verify network connectivity between llama-stack and Jaeger
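The defaults above come from the telemetry provider block in `stack_run_config.yaml`; roughly, it looks like this (surrounding keys abbreviated; the default OTLP endpoint targets the operator-managed collector in the `observability` namespace, so override `OTEL_EXPORTER_OTLP_ENDPOINT` if your Jaeger runs elsewhere):

```yaml
telemetry:
  - provider_id: meta-reference
    provider_type: inline::meta-reference
    config:
      service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
      sinks: ['console', 'otel_trace', 'otel_metric']
      otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
      otel_exporter_otlp_protocol: "http/protobuf"
```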
## Cleanup
To remove all deployed resources:
@@ -176,6 +201,12 @@ This will:
kubectl get endpoints
```
5. **Traces not appearing in Jaeger**:
- Check if the Jaeger pod is running: `kubectl get pods | grep jaeger`
- Verify that the llama-stack server waits for Jaeger to be ready before starting
- Check the telemetry configuration in `stack_run_config.yaml`
- Ensure the OTLP endpoint is correctly set to `http://jaeger.default.svc.cluster.local:4318`
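To rule out basic connectivity issues, you can probe the OTLP HTTP port from inside the cluster (a sketch using a throwaway curl pod; any HTTP status in the response, even a 400, shows the collector is reachable; adjust the URL to match the endpoint your config actually uses):

```bash
# POST an empty payload to the OTLP/HTTP traces endpoint and print the status code
kubectl run otlp-check --rm -it --restart=Never --image=curlimages/curl --command -- \
  curl -s -o /dev/null -w "%{http_code}\n" \
  -X POST -H "Content-Type: application/json" -d '{}' \
  http://jaeger.default.svc.cluster.local:4318/v1/traces
```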
### Viewing Logs
```bash
@@ -183,6 +214,7 @@ This will:
kubectl logs -f deployment/llama-stack-server
kubectl logs -f deployment/vllm-server
kubectl logs -f deployment/llama-stack-ui
kubectl logs -f deployment/jaeger
```
## Advanced Configuration
@@ -194,13 +226,11 @@ You can modify the resource limits in the YAML template files before deployment:
- `vllm-k8s.yaml.template`: vLLM server resources
- `stack-k8s.yaml.template`: Llama Stack server resources
- `llama-nim.yaml.template`: NIM server resources
- `jaeger-k8s.yaml.template`: Jaeger server resources
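For example, the Jaeger container's resources block in `jaeger-k8s.yaml.template` could be tuned like this (illustrative values only; the field names are standard Kubernetes, but choose requests and limits that fit your cluster):

```yaml
resources:
  requests:
    cpu: 250m
    memory: 512Mi
  limits:
    cpu: "1"
    memory: 1Gi
```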
## Additional Resources
- [Llama Stack Documentation](https://github.com/meta-llama/llama-stack)
- [vLLM Documentation](https://docs.vllm.ai/)
- [Kubernetes Documentation](https://kubernetes.io/docs/)
- [Jaeger Tracing Documentation](https://www.jaegertracing.io/docs/)

View file

@@ -12,7 +12,7 @@ export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack
-export INFERENCE_MODEL=meta-llama/Llama-3.2-3B-Instruct
+export INFERENCE_MODEL=meta-llama/Llama-3.3-70B-Instruct
export CODE_MODEL=bigcode/starcoder2-7b
export OLLAMA_MODEL=llama-guard3:1b
# Set USE_EBS to false if you don't have permission to use EKS EBS
@@ -80,6 +80,7 @@ if [ "$USE_EBS" = "true" ]; then
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | kubectl apply -f -
envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
@@ -99,6 +100,7 @@ else
envsubst < ./llama-nim.yaml.template | kubectl apply -f -
envsubst < ./postgres-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./chroma-k8s.yaml.template | sed 's/persistentVolumeClaim:/emptyDir: {}/g' | sed '/claimName:/d' | kubectl apply -f -
envsubst < ./jaeger-k8s.yaml.template | kubectl apply -f -
kubectl create configmap llama-stack-config --from-file=stack_run_config.yaml \
--dry-run=client -o yaml > stack-configmap.yaml

View file

@@ -89,6 +89,10 @@ envsubst < ./ollama-safety-k8s.yaml.template | kubectl delete -f - --ignore-not-
echo "Deleting vllm deployment..."
envsubst < ./vllm-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete jaeger deployment
echo "Deleting jaeger deployment..."
envsubst < ./jaeger-k8s.yaml.template | kubectl delete -f - --ignore-not-found=true
# Delete the HF token secret if it exists
if [ -n "${HF_TOKEN:-}" ]; then
echo "Deleting HF token secret..."
@@ -109,6 +113,7 @@ for template in ./*.yaml.template; do
"$template" != "./vllm-safety-k8s.yaml.template" &&
"$template" != "./ollama-safety-k8s.yaml.template" &&
"$template" != "./vllm-k8s.yaml.template" &&
"$template" != "./jaeger-k8s.yaml.template" &&
"$template" != "./set-secret.yaml.template" && "$template" != "./set-secret.yaml.template" &&
"$template" != "./ui-service-k8s.yaml.template" ]]; then "$template" != "./ui-service-k8s.yaml.template" ]]; then
echo "Deleting resources from $template..." echo "Deleting resources from $template..."
@ -133,6 +138,7 @@ kubectl delete service -l app=postgres --ignore-not-found=true
kubectl delete service -l app=vllm --ignore-not-found=true kubectl delete service -l app=vllm --ignore-not-found=true
kubectl delete service -l app=llama-nim --ignore-not-found=true kubectl delete service -l app=llama-nim --ignore-not-found=true
kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true kubectl delete service -l app.kubernetes.io/name=ollama-safety --ignore-not-found=true
kubectl delete service -l app=jaeger --ignore-not-found=true
# Delete any remaining secrets
echo "Deleting any remaining secrets..."

View file

@@ -36,3 +36,17 @@ helm install prometheus prometheus-community/kube-prometheus-stack \
echo "kube-prometheus-stack has been installed successfully!"
echo "To access Grafana UI, run: kubectl port-forward svc/kube-prometheus-stack-1754164871-grafana 31509:80 -n prometheus"
echo "Default Grafana credentials - Username: admin, Password: prom-operator"
# 1. Add the official chart repo and update your cache
helm repo add jaegertracing https://jaegertracing.github.io/helm-charts
helm repo update
# 2. (Optional) Create a namespace for observability tools
kubectl create namespace observability
# 3. Install Cert-Manager once per cluster (operator webhooks need it)
kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.14.0/cert-manager.yaml
# 4. Install the operator
helm install jaeger-operator jaegertracing/jaeger-operator \
--namespace observability \
--set rbac.clusterRole=true # watch the whole cluster
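# 5. (Suggested follow-up, assuming jaeger-dev.yaml sits alongside this script)
#    Create the all-in-one Jaeger instance and confirm the operator reconciled it;
#    the "jaeger-dev" name and "observability" namespace come from the CR below.
kubectl apply -f jaeger-dev.yaml
kubectl get jaegers -n observability
kubectl get pods -n observability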

View file

@@ -0,0 +1,10 @@
# jaeger-dev.yaml
apiVersion: jaegertracing.io/v1
kind: Jaeger
metadata:
  name: jaeger-dev
  namespace: observability
spec:
  strategy: allInOne   # single pod with agent, collector & query
  ingress:
    enabled: false     # set true + host rules if you use Ingress

File diff suppressed because it is too large

View file

@@ -48,7 +48,7 @@ spec:
name: ngc-api
key: NGC_API_KEY
- name: NVIDIA_VISIBLE_DEVICES
-value: "all"
+value: "0"
- name: NVIDIA_DRIVER_CAPABILITIES
value: "compute,utility"
- name: ENABLE_GPU_METRICS

View file

@@ -0,0 +1,4 @@
kubectl port-forward svc/llama-stack-ui-service 8322:8322 &
kubectl port-forward svc/llama-stack-service 8321:8321 &
kubectl port-forward svc/jaeger-dev-query 16686:16686 -n observability &
kubectl port-forward svc/kube-prometheus-stack-1754270486-grafana 3000:3000 -n prometheus

View file

@@ -64,8 +64,10 @@ data:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
-service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
-sinks: ${env.TELEMETRY_SINKS:=console}
+sinks: ['console', 'otel_trace', 'otel_metric']
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
otel_exporter_otlp_protocol: "http/protobuf"
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search

View file

@@ -75,6 +75,10 @@ spec:
value: http://vllm-server.default.svc.cluster.local:8001/v1
- name: VLLM_MAX_TOKENS
value: "3072"
- name: OTEL_EXPORTER_OTLP_ENDPOINT
value: http://jaeger-dev-collector.observability:4318
- name: OTEL_SERVICE_NAME
value: llama-stack
- name: NVIDIA_BASE_URL
value: http://llm-nim-code.default.svc.cluster.local:8000
- name: OLLAMA_BASE_URL

View file

@@ -61,8 +61,10 @@ providers:
- provider_id: meta-reference
provider_type: inline::meta-reference
config:
-service_name: "${env.OTEL_SERVICE_NAME:=\u200B}"
+service_name: "${env.OTEL_SERVICE_NAME:=llama-stack}"
-sinks: ${env.TELEMETRY_SINKS:=console}
+sinks: ['console', 'otel_trace', 'otel_metric']
otel_exporter_otlp_endpoint: ${env.OTEL_EXPORTER_OTLP_ENDPOINT:=http://jaeger-dev-collector.observability:4318}
otel_exporter_otlp_protocol: "http/protobuf"
tool_runtime:
- provider_id: brave-search
provider_type: remote::brave-search

View file

@@ -39,8 +39,10 @@ spec:
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args:
-- "vllm serve ${INFERENCE_MODEL} --enforce-eager --max-model-len 100000 --gpu-memory-utilization 0.9 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 2 --port 8001"
+- "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 4 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
env:
- name: NCCL_DEBUG
value: "INFO"
- name: INFERENCE_MODEL
value: "${INFERENCE_MODEL}"
- name: HUGGING_FACE_HUB_TOKEN
@@ -53,13 +55,19 @@ spec:
name: http
resources:
limits:
-nvidia.com/gpu: 1
+nvidia.com/gpu: 4
requests:
-nvidia.com/gpu: 1
+nvidia.com/gpu: 4
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
- name: cache-volume
mountPath: /dev/shm
volumes:
- emptyDir:
medium: Memory
sizeLimit: 4Gi
name: cache-volume
- name: llama-storage
persistentVolumeClaim:
claimName: vllm-models