This commit is contained in:
Kai Wu 2025-08-21 13:42:29 -07:00
parent edd57785a1
commit 2326f0166d
4 changed files with 9 additions and 7 deletions

View file

@@ -12,7 +12,7 @@ export POSTGRES_USER=llamastack
export POSTGRES_DB=llamastack
export POSTGRES_PASSWORD=llamastack
export INFERENCE_MODEL=meta-llama/Llama-3.3-70B-Instruct
export INFERENCE_MODEL=meta-llama/Llama-3.1-8B-Instruct
export CODE_MODEL=bigcode/starcoder2-7b
export OLLAMA_MODEL=llama-guard3:1b
# Set USE_EBS to false if you don't have permission to use EKS EBS

View file

@@ -60,11 +60,11 @@ spec:
resources:
requests:
memory: "2Gi"
cpu: "8000m"
cpu: "4000m"
ephemeral-storage: "6Gi"
limits:
memory: "2Gi"
cpu: "8000m"
cpu: "4000m"
ephemeral-storage: "6Gi"
env:
- name: ENABLE_CHROMADB
@@ -106,7 +106,7 @@ spec:
apt-get update && apt-get install -y git
# Clone the repository
git clone https://github.com/meta-llama/llama-stack.git /app
git checkout k8s_demo
git checkout 7f83433
cd /app/llama_stack/
# Install llama-stack

View file

@@ -37,6 +37,8 @@ spec:
git clone https://github.com/meta-llama/llama-stack.git /app
git checkout k8s_demo
# Navigate to the playground directory
cd /app
pip install -e .
cd /app/llama_stack/distribution/ui
# Install requirements

View file

@@ -39,7 +39,7 @@ spec:
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args:
- "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 4 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
- "vllm serve ${INFERENCE_MODEL} --enforce-eager -tp 1 --max-model-len 80000 --gpu-memory-utilization 0.92 --enable-auto-tool-choice --tool-call-parser llama3_json --max-num-seqs 1 --port 8001"
env:
- name: NCCL_DEBUG
value: "INFO"
@@ -55,9 +55,9 @@ spec:
name: http
resources:
limits:
nvidia.com/gpu: 4
nvidia.com/gpu: 1
requests:
nvidia.com/gpu: 4
nvidia.com/gpu: 1
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface