diff --git a/docs/source/distributions/k8s/apply.sh b/docs/source/distributions/k8s/apply.sh
index 4b025d20f..67754bdb7 100755
--- a/docs/source/distributions/k8s/apply.sh
+++ b/docs/source/distributions/k8s/apply.sh
@@ -12,7 +12,7 @@
 export POSTGRES_USER=llamastack
 export POSTGRES_DB=llamastack
 export POSTGRES_PASSWORD=llamastack
-export INFERENCE_MODEL=meta-llama/Llama-3.3-70B-Instruct
+export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
 export CODE_MODEL=bigcode/starcoder2-7b
 export OLLAMA_MODEL=llama-guard3:1b
 # Set USE_EBS to false if you don't have permission to use EKS EBS
diff --git a/docs/source/distributions/k8s/stack-k8s.yaml.template b/docs/source/distributions/k8s/stack-k8s.yaml.template
index 7d24a8853..4b85a2063 100644
--- a/docs/source/distributions/k8s/stack-k8s.yaml.template
+++ b/docs/source/distributions/k8s/stack-k8s.yaml.template
@@ -60,11 +60,11 @@ spec:
         resources:
           requests:
             memory: "2Gi"
-            cpu: "8000m"
+            cpu: "4000m"
             ephemeral-storage: "6Gi"
           limits:
             memory: "2Gi"
-            cpu: "8000m"
+            cpu: "4000m"
             ephemeral-storage: "6Gi"
         env:
         - name: ENABLE_CHROMADB
@@ -106,7 +106,7 @@ spec:
           apt-get update && apt-get install -y git
           # Clone the repository
           git clone https://github.com/meta-llama/llama-stack.git /app
-          git checkout k8s_demo
+          git checkout 7f83433
           cd /app/llama_stack/

           # Install llama-stack
diff --git a/docs/source/distributions/k8s/ui-k8s.yaml.template b/docs/source/distributions/k8s/ui-k8s.yaml.template
index b92aadb4d..ad83120f8 100644
--- a/docs/source/distributions/k8s/ui-k8s.yaml.template
+++ b/docs/source/distributions/k8s/ui-k8s.yaml.template
@@ -37,6 +37,8 @@ spec:
           git clone https://github.com/meta-llama/llama-stack.git /app
           git checkout k8s_demo
           # Navigate to the playground directory
+          cd /app
+          pip install -e .
           cd /app/llama_stack/distribution/ui

           # Install requirements
diff --git a/docs/source/distributions/k8s/vllm-k8s.yaml.template b/docs/source/distributions/k8s/vllm-k8s.yaml.template
index 36a946d16..5bd091d8c 100644
--- a/docs/source/distributions/k8s/vllm-k8s.yaml.template
+++ b/docs/source/distributions/k8s/vllm-k8s.yaml.template
@@ -39,7 +39,7 @@ spec:
         image: vllm/vllm-openai:latest
         command: ["/bin/sh", "-c"]
         args:
-          - "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 4 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
+          - "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 1 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
         env:
         - name: NCCL_DEBUG
           value: "INFO"
@@ -55,9 +55,9 @@ spec:
           name: http
         resources:
           limits:
-            nvidia.com/gpu: 4
+            nvidia.com/gpu: 1
           requests:
-            nvidia.com/gpu: 4
+            nvidia.com/gpu: 1
         volumeMounts:
         - name: llama-storage
           mountPath: /root/.cache/huggingface
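
For context on how these `*.yaml.template` files are consumed, here is a minimal sketch, assuming apply.sh follows the usual envsubst-plus-kubectl pattern for templated manifests; the diff does not show apply.sh's actual rendering steps, so everything except the template names and the new INFERENCE_MODEL default is illustrative.

```sh
# Minimal sketch, NOT the repo's actual apply.sh: renders the three templates
# touched by this diff and applies them. Assumes envsubst (from gettext) and
# kubectl are installed and a cluster context is configured.
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct   # new default from this diff

for t in vllm-k8s stack-k8s ui-k8s; do
  # Substitute ${INFERENCE_MODEL} and friends into the template, then apply it.
  envsubst < "docs/source/distributions/k8s/${t}.yaml.template" | kubectl apply -f -
done
```

Note that the diff keeps vLLM's `-tp` flag in step with the pod's `nvidia.com/gpu` resource: the tensor-parallel size must match the number of GPUs visible to the container, which is why both values drop from 4 to 1 together (and why the default model shrinks from a 70B to an 8B variant that fits on a single GPU).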