refactor(install): simplify demo to two‑container flow, drop host‑level installs

reluctantfuturist 2025-04-16 12:05:58 -07:00
parent 6d5d1480c9
commit e70c4e67de


@@ -1,401 +1,69 @@
#!/usr/bin/env bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
set -e
set -Eeuo pipefail
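# -E keeps ERR traps active inside functions, -e exits on errors, -u flags unset
# variables, and pipefail makes a pipeline fail if any stage fails.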
# Color codes for output formatting
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
RED='\033[0;31m'
NC='\033[0m' # No Color
BOLD='\033[1m'
PORT=8321
OLLAMA_PORT=11434
MODEL_ALIAS="llama3.2:1b"
SERVER_IMAGE="llamastack/distribution-ollama:0.2.2"
WAIT_TIMEOUT=300
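# Maximum seconds to wait for Ollama and LlamaStack to report healthy.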
# Default values
PORT=5001
INFERENCE_MODEL="meta-llama/Llama-3.2-3B-Instruct"
OLLAMA_MODEL_ALIAS="llama3.2:3b-instruct-fp16"
OLLAMA_URL="http://localhost:11434"
CONTAINER_ENGINE=""
log(){ printf "\e[1;32m%s\e[0m\n" "$*"; }
die(){ printf "\e[1;31m❌ %s\e[0m\n" "$*" >&2; exit 1; }
# Functions
command -v docker >/dev/null || die "Docker is required but not found."
print_banner() {
echo -e "${BOLD}==================================================${NC}"
echo -e "${BOLD} Llama Stack Ollama Distribution Setup ${NC}"
echo -e "${BOLD}==================================================${NC}"
}
# clean up any leftovers from earlier runs
for name in ollama-server llama-stack; do
docker ps -aq --filter "name=^${name}$" | xargs -r docker rm -f
done
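# xargs -r (GNU --no-run-if-empty) skips `docker rm -f` when nothing matched.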
check_command() {
command -v "$1" &> /dev/null
}
###############################################################################
# 1. Ollama
###############################################################################
log "🦙 Starting Ollama…"
docker run -d --name ollama-server -p "$OLLAMA_PORT:11434" \
-v ollama-models:/root/.ollama \
ollama/ollama >/dev/null
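# Models are cached in the named volume, so they survive container restarts;
# `docker volume rm ollama-models` clears the cache if needed.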
# Function to check prerequisites
check_prerequisites() {
echo -e "\n${BOLD}Checking prerequisites...${NC}"
log "⏳ Waiting for Ollama daemon…"
timeout "$WAIT_TIMEOUT" bash -c \
"until curl -fsS http://localhost:${OLLAMA_PORT}/ 2>/dev/null | grep -q 'Ollama'; do sleep 1; done" \
|| die "Ollama did not become ready in ${WAIT_TIMEOUT}s"
# Check for container engine (Docker or Podman)
if check_command docker; then
echo -e "${GREEN}${NC} Docker is installed"
CONTAINER_ENGINE="docker"
elif check_command podman; then
echo -e "${GREEN}${NC} Podman is installed"
CONTAINER_ENGINE="podman"
else
echo -e "${RED}Error: Neither Docker nor Podman is installed. Please install one of them first.${NC}"
echo "Visit https://docs.docker.com/get-docker/ or https://podman.io/getting-started/installation for installation instructions."
exit 1
fi
if ! docker exec ollama-server ollama list | grep -q "$MODEL_ALIAS"; then
log "📦 Pulling model $MODEL_ALIAS"
docker exec ollama-server ollama pull "$MODEL_ALIAS"
fi
# Check Python and pip
if check_command python3; then
PYTHON_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[:2])))')
if [[ $(echo "$PYTHON_VERSION >= 3.10" | bc -l) -eq 1 ]]; then
echo -e "${GREEN}${NC} Python $PYTHON_VERSION is installed"
HAS_PYTHON=true
else
echo -e "${YELLOW}Warning: Python $PYTHON_VERSION detected. Python 3.10+ recommended.${NC}"
HAS_PYTHON=false
fi
else
echo -e "${YELLOW}Warning: Python 3 is not found. Will use container for operations.${NC}"
HAS_PYTHON=false
fi
log "🚀 Launching model runtime…"
docker exec -d ollama-server ollama run "$MODEL_ALIAS" --keepalive 60m
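# --keepalive 60m keeps the model resident in memory, so the first LlamaStack
# request does not pay the model-load cost again.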
# Check pip
if [ "$HAS_PYTHON" = true ]; then
if check_command pip || check_command pip3; then
echo -e "${GREEN}${NC} pip is installed"
HAS_PIP=true
else
echo -e "${YELLOW}Warning: pip is not found. Will use container for operations.${NC}"
HAS_PIP=false
HAS_PYTHON=false
fi
fi
}
###############################################################################
# 2. LlamaStack
###############################################################################
log "🦙📦 Starting LlamaStack…"
docker run -d --name llama-stack \
-p "$PORT:$PORT" \
--add-host=host.docker.internal:host-gateway \
"$SERVER_IMAGE" \
--port "$PORT" \
--env INFERENCE_MODEL="$MODEL_ALIAS" \
--env OLLAMA_URL="http://host.docker.internal:${OLLAMA_PORT}" >/dev/null
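# --add-host maps host.docker.internal to the host gateway, so the server
# container can reach the Ollama port published on the host.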
# Function to install Ollama
install_ollama() {
echo -e "\n${BOLD}Installing Ollama...${NC}"
log "⏳ Waiting for LlamaStack API…"
timeout "$WAIT_TIMEOUT" bash -c \
"until curl -fsS http://localhost:${PORT}/v1/health 2>/dev/null | grep -q 'OK'; do sleep 1; done" \
|| die "LlamaStack did not become ready in ${WAIT_TIMEOUT}s"
if check_command ollama; then
echo -e "${GREEN}${NC} Ollama is already installed"
else
echo "Installing Ollama..."
curl -fsSL https://ollama.com/install.sh | sh
if [ $? -eq 0 ]; then
echo -e "${GREEN}${NC} Ollama installed successfully"
else
echo -e "${RED}Error: Failed to install Ollama.${NC}"
exit 1
fi
fi
}
# Function to start Ollama server
start_ollama() {
echo -e "\n${BOLD}Starting Ollama server...${NC}"
# Check if Ollama is already running
if curl -s "$OLLAMA_URL" &> /dev/null; then
echo -e "${GREEN}${NC} Ollama server is already running"
else
echo "Starting Ollama server..."
ollama serve &
# Wait for Ollama server to start
MAX_RETRIES=30
RETRY_COUNT=0
while ! curl -s "$OLLAMA_URL" &> /dev/null; do
sleep 1
RETRY_COUNT=$((RETRY_COUNT + 1))
if [ $RETRY_COUNT -ge $MAX_RETRIES ]; then
echo -e "${RED}Error: Ollama server failed to start after $MAX_RETRIES seconds.${NC}"
exit 1
fi
done
echo -e "${GREEN}${NC} Ollama server started successfully"
fi
}
# Function to pull models
pull_models() {
echo -e "\n${BOLD}Pulling and running Llama model in Ollama...${NC}"
# Pull model
echo "Pulling $INFERENCE_MODEL model as $OLLAMA_MODEL_ALIAS..."
ollama pull $OLLAMA_MODEL_ALIAS
if [ $? -ne 0 ]; then
echo -e "${RED}Error: Failed to pull $OLLAMA_MODEL_ALIAS model.${NC}"
exit 1
fi
# Kill any existing model processes
pkill -f "ollama run $OLLAMA_MODEL_ALIAS" || true
# Start model in background
echo "Starting inference model..."
nohup ollama run $OLLAMA_MODEL_ALIAS --keepalive 60m > /dev/null 2>&1 &
# Verify model is running by checking the Ollama API
echo "Waiting for model to start (this may take a minute)..."
MAX_RETRIES=30
RETRY_DELAY=2
# Wait for model to appear in the Ollama API
for i in $(seq 1 $MAX_RETRIES); do
echo -n "."
MODELS_RUNNING=$(curl -s "$OLLAMA_URL/api/ps" | grep -E "$OLLAMA_MODEL_ALIAS" | wc -l)
if [ "$MODELS_RUNNING" -ge 1 ]; then
echo -e "\n${GREEN}${NC} Model is running successfully"
break
fi
if [ $i -eq $MAX_RETRIES ]; then
echo -e "\n${RED}Error: Model failed to start within the expected time.${NC}"
exit 1
fi
sleep $RETRY_DELAY
done
}
# Function to set up Python environment and install llama-stack-client
setup_llama_stack_cli() {
echo -e "\n${BOLD}Setting up llama-stack environment...${NC}"
# Create virtual environment
echo "Creating Python virtual environment..."
VENV_DIR="$HOME/.venv/llama-stack"
if [ -d "$VENV_DIR" ]; then
echo "Virtual environment already exists at $VENV_DIR"
else
python3 -m venv "$VENV_DIR"
if [ $? -ne 0 ]; then
echo -e "${RED}Error: Failed to create virtual environment.${NC}"
exit 1
else
echo -e "${GREEN}${NC} Virtual environment created successfully"
fi
fi
# Activate virtual environment and install packages
source "$VENV_DIR/bin/activate"
echo "Installing llama-stack-client..."
pip install --upgrade pip
pip install llama-stack-client
if [ $? -eq 0 ]; then
echo -e "${GREEN}${NC} llama-stack-client installed successfully"
# Configure the client to point to the correct server
echo "Configuring llama-stack-client..."
llama-stack-client configure --endpoint "http://localhost:$PORT"
if [ $? -eq 0 ]; then
echo -e "${GREEN}${NC} llama-stack-client configured to use http://localhost:$PORT"
# Set environment variable for CLI use
export LLAMA_STACK_BASE_URL="http://localhost:$PORT"
# Add to shell config if it exists
if [ -f "$HOME/.bashrc" ]; then
grep -q "LLAMA_STACK_BASE_URL" "$HOME/.bashrc" || echo "export LLAMA_STACK_BASE_URL=\"http://localhost:$PORT\"" >> "$HOME/.bashrc"
elif [ -f "$HOME/.zshrc" ]; then
grep -q "LLAMA_STACK_BASE_URL" "$HOME/.zshrc" || echo "export LLAMA_STACK_BASE_URL=\"http://localhost:$PORT\"" >> "$HOME/.zshrc"
fi
else
echo -e "${YELLOW}Warning: Failed to configure llama-stack-client. You may need to run 'llama-stack-client configure --endpoint http://localhost:$PORT' manually.${NC}"
fi
else
echo -e "${RED}Error: Failed to install llama-stack-client.${NC}"
exit 1
fi
}
# Function to run a test inference
run_test_inference() {
# Run a test inference to verify everything is working
echo -e "\n${BOLD}Running test inference...${NC}"
# Show the query being sent
TEST_QUERY="hello, what model are you?"
echo -e "${BOLD}Query:${NC} \"$TEST_QUERY\""
# Send the query and capture the result
echo -e "${BOLD}Sending request...${NC}"
TEST_RESULT=$(llama-stack-client inference chat-completion --message "$TEST_QUERY" 2>&1)
TEST_STATUS=$?
# Display the full result
echo -e "\n${BOLD}Response:${NC}"
echo "$TEST_RESULT"
if [[ $TEST_STATUS -eq 0 && "$TEST_RESULT" == *"content"* ]]; then
echo -e "\n${GREEN}✓${NC} Test inference successful! Response received from the model."
echo -e "${BOLD}Everything is working correctly!${NC}"
else
echo -e "\n${YELLOW}Warning: Test inference might have failed.${NC}"
echo -e "You can try running a test manually after activation:"
echo -e "${YELLOW}source $VENV_DIR/bin/activate${NC}"
echo -e "${YELLOW}llama-stack-client inference chat-completion --message \"hello, what model are you?\"${NC}"
fi
}
# Function to run the llama-stack server
run_llama_stack() {
echo -e "\n${BOLD}Starting Llama Stack server...${NC}"
mkdir -p "$HOME/.llama"
# Check if container already exists
CONTAINER_NAME="llama-stack-ollama"
CONTAINER_EXISTS=false
CONTAINER_RUNNING=false
if [ "$CONTAINER_ENGINE" = "docker" ]; then
if docker ps -a --format '{{.Names}}' | grep -q "^$CONTAINER_NAME$"; then
CONTAINER_EXISTS=true
if docker ps --format '{{.Names}}' | grep -q "^$CONTAINER_NAME$"; then
CONTAINER_RUNNING=true
fi
fi
elif [ "$CONTAINER_ENGINE" = "podman" ]; then
if podman ps -a --format '{{.Names}}' | grep -q "^$CONTAINER_NAME$"; then
CONTAINER_EXISTS=true
if podman ps --format '{{.Names}}' | grep -q "^$CONTAINER_NAME$"; then
CONTAINER_RUNNING=true
fi
fi
fi
# Handle existing container
if [ "$CONTAINER_EXISTS" = true ]; then
if [ "$CONTAINER_RUNNING" = true ]; then
echo -e "${YELLOW}Container $CONTAINER_NAME is already running${NC}"
echo -e "${GREEN}${NC} Llama Stack server is already running"
echo -e "\n${BOLD}Access Information:${NC}"
echo -e " • API URL: ${GREEN}http://localhost:$PORT${NC}"
echo -e " • Inference Model: ${GREEN}$INFERENCE_MODEL${NC}"
echo -e " • Ollama URL: ${GREEN}$OLLAMA_URL${NC}"
echo -e "\n${BOLD}Management Commands:${NC}"
echo -e " • Stop Llama Stack: ${YELLOW}${CONTAINER_ENGINE} stop $CONTAINER_NAME${NC}"
echo -e " • Start Llama Stack: ${YELLOW}${CONTAINER_ENGINE} start $CONTAINER_NAME${NC}"
echo -e " • View Logs: ${YELLOW}${CONTAINER_ENGINE} logs $CONTAINER_NAME${NC}"
echo -e " • Stop Ollama: ${YELLOW}pkill ollama${NC}"
# Run a test inference
run_test_inference
return 0
else
echo -e "${YELLOW}Container $CONTAINER_NAME exists but is not running${NC}"
if [ "$CONTAINER_ENGINE" = "docker" ]; then
echo "Removing existing container..."
docker rm $CONTAINER_NAME
elif [ "$CONTAINER_ENGINE" = "podman" ]; then
echo "Removing existing container..."
podman rm $CONTAINER_NAME
fi
fi
fi
# Set the correct host value based on container engine
if [ "$CONTAINER_ENGINE" = "docker" ]; then
if [[ "$OSTYPE" == "linux-gnu"* ]]; then
# Linux with Docker should use host network
echo "Running Llama Stack server on Linux with Docker..."
docker run -d \
--name $CONTAINER_NAME \
-p $PORT:$PORT \
-v "$HOME/.llama:/root/.llama" \
--network=host \
llamastack/distribution-ollama \
--port $PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://localhost:11434
else
# macOS/Windows with Docker should use host.docker.internal
echo "Running Llama Stack server with Docker..."
docker run -d \
--name $CONTAINER_NAME \
-p $PORT:$PORT \
-v "$HOME/.llama:/root/.llama" \
llamastack/distribution-ollama \
--port $PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://host.docker.internal:11434
fi
elif [ "$CONTAINER_ENGINE" = "podman" ]; then
# Check podman version for proper host naming
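# (Podman >= 4.7.0 also resolves host.docker.internal; older releases only provide host.containers.internal.)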
PODMAN_VERSION=$(podman --version | awk '{print $3}')
PODMAN_MAJOR=$(echo "$PODMAN_VERSION" | cut -d. -f1); PODMAN_MINOR=$(echo "$PODMAN_VERSION" | cut -d. -f2)
if [ "$PODMAN_MAJOR" -gt 4 ] || { [ "$PODMAN_MAJOR" -eq 4 ] && [ "$PODMAN_MINOR" -ge 7 ]; }; then
HOST_NAME="host.docker.internal"
else
HOST_NAME="host.containers.internal"
fi
echo "Running Llama Stack server with Podman..."
podman run -d \
--name $CONTAINER_NAME \
-p $PORT:$PORT \
-v "$HOME/.llama:/root/.llama:Z" \
llamastack/distribution-ollama \
--port $PORT \
--env INFERENCE_MODEL=$INFERENCE_MODEL \
--env OLLAMA_URL=http://$HOST_NAME:11434
fi
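# (The :Z suffix on the Podman mount above relabels $HOME/.llama for SELinux so the container can write to it.)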
if [ $? -eq 0 ]; then
echo -e "${GREEN}${NC} Llama Stack server started successfully"
echo -e "\n${BOLD}Setup Complete!${NC}"
echo -e "\n${BOLD}Access Information:${NC}"
echo -e " • API URL: ${GREEN}http://localhost:$PORT${NC}"
echo -e " • Inference Model: ${GREEN}$INFERENCE_MODEL${NC}"
echo -e " • Ollama URL: ${GREEN}$OLLAMA_URL${NC}"
echo -e "\n${BOLD}Management Commands:${NC}"
echo -e " • Stop Llama Stack: ${YELLOW}${CONTAINER_ENGINE} stop $CONTAINER_NAME${NC}"
echo -e " • Start Llama Stack: ${YELLOW}${CONTAINER_ENGINE} start $CONTAINER_NAME${NC}"
echo -e " • View Logs: ${YELLOW}${CONTAINER_ENGINE} logs $CONTAINER_NAME${NC}"
echo -e " • Stop Ollama: ${YELLOW}pkill ollama${NC}"
echo -e "\n${BOLD}Using Llama Stack Client:${NC}"
echo -e "1. Activate the virtual environment: ${YELLOW}source $VENV_DIR/bin/activate${NC}"
echo -e "2. Set the server URL: ${YELLOW}export LLAMA_STACK_BASE_URL=http://localhost:$PORT${NC}"
echo -e "3. Run client commands: ${YELLOW}llama-stack-client --help${NC}"
# Run a test inference
run_test_inference
else
echo -e "${RED}Error: Failed to start Llama Stack server.${NC}"
exit 1
fi
}
# Main installation flow
main() {
print_banner
check_prerequisites
install_ollama
start_ollama
pull_models
setup_llama_stack_cli
run_llama_stack
}
# Run main function
main
###############################################################################
# Done
###############################################################################
log ""
log "🎉 LlamaStack is ready!"
log "👉 API endpoint: http://localhost:${PORT}"