This commit is contained in:
Kai Wu 2025-08-21 13:42:29 -07:00
parent edd57785a1
commit 2326f0166d
4 changed files with 9 additions and 7 deletions

View file

@@ -12,7 +12,7 @@ export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack
export INFERENCE_MODEL=meta-llama/Llama-3.3-70B-Instruct
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
export CODE_MODEL=bigcode/starcoder2-7b
export OLLAMA_MODEL=llama-guard3:1b
# Set USE_EBS to false if you don't have permission to use EKS EBS

View file

@@ -60,11 +60,11 @@ spec:
resources:
requests:
memory: "2Gi"
cpu: "8000m"
cpu: "4000m"
ephemeral-storage: "6Gi"
limits:
memory: "2Gi"
cpu: "8000m"
cpu: "4000m"
ephemeral-storage: "6Gi"
env:
- name: ENABLE_CHROMADB
@@ -106,7 +106,7 @@ spec:
apt-get update && apt-get install -y git
# Clone the repository
git clone https://github.com/meta-llama/llama-stack.git /app
git checkout k8s_demo
git checkout 7f83433
cd /app/llama_stack/
# Install llama-stack

View file

@@ -37,6 +37,8 @@ spec:
git clone https://github.com/meta-llama/llama-stack.git /app
git checkout k8s_demo
# Navigate to the playground directory
cd /app
pip install -e .
cd /app/llama_stack/distribution/ui
# Install requirements

View file

@@ -39,7 +39,7 @@ spec:
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args:
- "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 4 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
- "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 1 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
env:
- name: NCCL_DEBUG
value: "INFO"
@@ -55,9 +55,9 @@ spec:
name: http
resources:
limits:
nvidia.com/gpu: 4
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 4
nvidia.com/gpu: 1
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface