#!/usr/bin/env bash
# auto-install.sh — autonomous end-to-end install test for veilor-os.
#
# Boots a fresh ISO under QEMU, drives the gum installer via the QEMU
# monitor (sendkey events), waits for anaconda to finish + reboot, SSHes
# into the installed system, and runs a validation checklist.
#
# Usage:
#   ./test/auto-install.sh path/to/veilor-os-*.iso
#
# Expected runtime:
#   * boot + drive installer:  ~3 min
#   * anaconda install (KDE):  ~15-25 min (depends on mirrors + host CPU)
#   * reboot + SSH up:         ~2 min
#   * validation checks:       <1 min
#   * total:                   20-30 min wall clock
#
# Hardcoded test inputs (do NOT edit — meant to be deterministic):
#   disk      first /dev/vda  (only disk in QEMU)
#   hostname  "veilor"        (installer hardcodes this in v0.5.4)
#   LUKS pw   testpass1234
#   admin pw  adminpass1234
#   locale    en_GB.UTF-8     (first option, accepted with Enter)
#
# Outputs:
#   /tmp/veilor-auto-install.log             — full driver log
#   /tmp/veilor-auto-install-NN-<label>.png  — milestone screenshots
#   /tmp/veilor-auto-install-final-ssh.txt   — final SSH session capture
#
# Exit codes:
#   0 = all validation checks passed
#   1 = any failure (anaconda crash, SSH never up, validation failed)
#   2 = preflight failure (missing tool, bad ISO arg)
#
# This script intentionally does not source test/run-vm.sh — it needs a
# different QEMU configuration (no live cloud-init seed since we're driving
# the installed-system path), and run-vm.sh `exec`s qemu, which is
# incompatible with running QEMU as a backgrounded child here.

set -uo pipefail

# ── Constants ──────────────────────────────────────────────────────────

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
TEST_DIR="$SCRIPT_DIR"
DISK="$TEST_DIR/auto-install-vm.qcow2"
NVRAM="$TEST_DIR/auto-install-vm.nvram"
MONITOR_SOCK="$TEST_DIR/auto-install-vm.monitor.sock"
SERIAL_LOG="$TEST_DIR/auto-install-vm.serial.log"
SEED_ISO="$TEST_DIR/auto-install-seed.iso"
LOG=/tmp/veilor-auto-install.log
SHOT_PREFIX=/tmp/veilor-auto-install
SSH_PORT=2222
SSH_USER=admin
LUKS_PW="testpass1234"
ADMIN_PW="adminpass1234"

# Disk: 40G is enough headroom — KDE base + 8G LUKS + LVM overhead fits in
# ~12G actual, but qcow2 only allocates blocks that get touched.
DISK_SIZE=40G

# OVMF firmware paths — Fedora layout. Caller can override if needed.
OVMF_CODE="${OVMF_CODE:-/usr/share/edk2/ovmf/OVMF_CODE.fd}"
OVMF_VARS_SRC="${OVMF_VARS_SRC:-/usr/share/edk2/ovmf/OVMF_VARS.fd}"

# Timing knobs — coarse but deliberate. Tighten only after observing slack
# on a real run.
WAIT_MONITOR_S=120           # qemu boot to monitor socket alive
WAIT_INSTALLER_BANNER_S=180  # ISO boot → tty1 gum menu visible
WAIT_GUM_PROMPT_S=8          # gum draws each prompt within ~5s; 8s adds slack
WAIT_AFTER_INPUT_S=3         # let UI advance after we hit Enter
ANACONDA_TIMEOUT_S=2700      # 45 min — anaconda + reboot + SSH come-up
ANACONDA_POLL_S=30           # screenshot/poll cadence during install

# ── Logging ────────────────────────────────────────────────────────────

: > "$LOG"
log()  { printf '[%s] %s\n' "$(date +'%H:%M:%S')" "$*" | tee -a "$LOG"; }
fail() { log "FAIL: $*"; exit 1; }

# Source the keymap helper.
# shellcheck source=auto-install-keymap.sh
. "$SCRIPT_DIR/auto-install-keymap.sh"

# ── Preflight ──────────────────────────────────────────────────────────

preflight() {
    log "preflight: checking environment"

    ISO="${1:-}"
    if [[ -z $ISO ]]; then
        # Auto-fetch from ci-latest GH release if no path given. ISO is split
        # into chunks (GH release 2 GiB asset cap). Reassemble before boot.
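        # Assumed asset layout on the release (the split step lives in CI,
        # not here; names are illustrative):
        #   <name>.iso.part-00, <name>.iso.part-01, ...   raw 'split' chunks
        #   <name>.iso.parts.sha256                       checksums of the parts
        # i.e. roughly the output of:
        #   split -b 1900M -d x.iso x.iso.part-
        #   sha256sum x.iso.part-* > x.iso.parts.sha256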
log "no ISO path given — fetching from gh release ci-latest" local dl_dir="$HOME/veilor-iso/ci-latest" mkdir -p "$dl_dir" ( cd "$dl_dir" && rm -f *.part-* *.iso *.sha256 && \ gh release download ci-latest --repo veilor-org/veilor-os \ --pattern '*.iso.part-*' --pattern '*.parts.sha256' --clobber ) || { echo "[ERR] gh release download failed — is the ci-latest release populated?" >&2 exit 2 } ( cd "$dl_dir" && \ local stem stem=$(ls *.part-00 2>/dev/null | head -1 | sed 's/\.part-00$//') [ -n "$stem" ] || { echo "[ERR] no .part-00 in download"; exit 2; } log "reassembling $stem from $(ls "$stem".part-* | wc -l) parts" cat "$stem".part-* > "$stem" sha256sum -c *.parts.sha256 || { echo "[ERR] reassembly checksum mismatch"; exit 2; } ) || exit 2 ISO=$(ls "$dl_dir"/*.iso 2>/dev/null | head -1) [ -n "$ISO" ] || { echo "[ERR] no iso after reassembly"; exit 2; } fi if [[ ! -f $ISO ]]; then echo "[ERR] ISO not found: $ISO" >&2 exit 2 fi km_require_tools || exit 2 for t in ssh ssh-keygen pgrep pkill; do command -v "$t" >/dev/null 2>&1 || { echo "[ERR] missing $t" >&2; exit 2; } done if [[ ! -f $OVMF_CODE ]]; then echo "[ERR] OVMF firmware missing: $OVMF_CODE (install edk2-ovmf)" >&2 exit 2 fi log "preflight: ISO=$ISO" } # ── VM lifecycle ─────────────────────────────────────────────────────── # Kill any QEMU we previously started + scrub state files. Idempotent. kill_existing_vm() { log "killing any existing auto-install QEMU" if [[ -n "${QEMU_PID:-}" ]] && kill -0 "$QEMU_PID" 2>/dev/null; then kill "$QEMU_PID" 2>/dev/null || true sleep 2 kill -9 "$QEMU_PID" 2>/dev/null || true fi # Catch orphans from prior runs — match by disk path so we don't kill # the user's other QEMU VMs. pkill -f "qemu-system-x86_64.*$DISK" 2>/dev/null || true rm -f "$MONITOR_SOCK" "$SERIAL_LOG" } # Wipe disk + nvram so each run is reproducible. wipe_state() { log "wiping qcow2 + nvram" rm -f "$DISK" "$NVRAM" "$SEED_ISO" qemu-img create -f qcow2 "$DISK" "$DISK_SIZE" >/dev/null cp "$OVMF_VARS_SRC" "$NVRAM" } # Build a NoCloud cloud-init seed ISO so anaconda's installed system picks # up our SSH pubkey on first boot. The installer-generated ks doesn't # explicitly invoke cloud-init, but Fedora ships cloud-init enabled by # default in @core; if a cidata seed is present at boot, NoCloud datasource # fires and we get key injection for free. build_seed_iso() { local pubkey="" found="" for cand in "$HOME/.ssh/id_ed25519.pub" "$HOME/.ssh/id_rsa.pub"; do if [[ -f $cand ]]; then pubkey="$(< "$cand")" found=$cand break fi done if [[ -z $pubkey ]]; then log "seed: no host SSH pubkey found at ~/.ssh/id_{ed25519,rsa}.pub" log "seed: generating throwaway test key" local key=$TEST_DIR/auto-install-id_ed25519 rm -f "$key" "$key.pub" ssh-keygen -t ed25519 -N '' -f "$key" -C "veilor-auto-install" >/dev/null pubkey="$(< "$key.pub")" TEST_KEY="$key" else log "seed: using $found" # Match host id; assume corresponding private key exists alongside. TEST_KEY="${found%.pub}" fi local d d=$(mktemp -d) cat > "$d/meta-data" < "$d/user-data" </dev/null 2>&1; then mkisofs -quiet -output "$SEED_ISO" -volid cidata -joliet -rock \ "$d/user-data" "$d/meta-data" elif command -v xorriso >/dev/null 2>&1; then xorriso -as mkisofs -quiet -output "$SEED_ISO" -volid cidata \ -joliet -rock "$d/user-data" "$d/meta-data" else log "seed: no mkisofs/xorriso — SSH key injection unavailable" SEED_ISO="" fi rm -rf "$d" [[ -f $SEED_ISO ]] && log "seed: built $SEED_ISO" } # Launch QEMU in the background. Returns once the monitor socket is alive. 
launch_vm() {
    local iso=$1
    log "launching QEMU"

    local seed_args=()
    [[ -n $SEED_ISO && -f $SEED_ISO ]] && \
        seed_args=(-drive "file=$SEED_ISO,media=cdrom,readonly=on")

    qemu-system-x86_64 \
        -name veilor-auto-install \
        -enable-kvm \
        -cpu host \
        -smp 4 \
        -m 4096 \
        -machine q35,smm=on \
        -global driver=cfi.pflash01,property=secure,value=on \
        -drive if=pflash,format=raw,readonly=on,file="$OVMF_CODE" \
        -drive if=pflash,format=raw,file="$NVRAM" \
        -drive file="$DISK",if=virtio,format=qcow2,cache=writeback \
        -drive file="$iso",media=cdrom,readonly=on \
        "${seed_args[@]}" \
        -monitor "unix:$MONITOR_SOCK,server,nowait" \
        -boot order=dc,menu=off \
        -netdev user,id=net0,hostfwd=tcp::${SSH_PORT}-:22 \
        -device virtio-net-pci,netdev=net0 \
        -device virtio-rng-pci \
        -vga virtio \
        -display none \
        -serial "file:$SERIAL_LOG" \
        >>"$LOG" 2>&1 &
    QEMU_PID=$!
    log "QEMU pid=$QEMU_PID"

    km_wait_socket "$MONITOR_SOCK" "$WAIT_MONITOR_S" \
        || fail "monitor socket never opened"
    log "monitor socket ready"
}

# Did QEMU die? Used at every poll; lets us bail with a useful message
# instead of waiting out the full timeout.
qemu_alive() {
    [[ -n "${QEMU_PID:-}" ]] && kill -0 "$QEMU_PID" 2>/dev/null
}

# ── Driver: walk the installer flow ────────────────────────────────────

# Take a numbered screenshot. Auto-increments NN.
SHOT_N=0
shot() {
    local label=$1
    SHOT_N=$((SHOT_N + 1))
    local file
    file=$(printf '%s-%02d-%s.png' "$SHOT_PREFIX" "$SHOT_N" "$label")
    km_screendump "$MONITOR_SOCK" "$file"
    log "screenshot: $file"
}

drive_installer() {
    log "waiting ${WAIT_INSTALLER_BANNER_S}s for ISO boot + tty1 installer"
    # The live ISO autologs into multi-user.target, runs gum on tty1 via a
    # systemd unit that replaces getty (see overlay/etc/systemd/system/
    # veilor-installer.service if it exists; otherwise via the multi-user
    # default in kickstart line 250).
    sleep "$WAIT_INSTALLER_BANNER_S"
    qemu_alive || fail "QEMU died during ISO boot"
    shot "boot-banner"

    # Make absolutely sure we're on tty1 (the live ks sets multi-user.target
    # default, so we should already be there — but a stray graphical.target
    # on dev builds would silently swallow our keystrokes).
    km_send_chord "$MONITOR_SOCK" ctrl alt f1
    sleep "$WAIT_AFTER_INPUT_S"
    shot "tty1"

    # Step 1: top option = "Install" — gum choose has it pre-selected.
    log "step: select Install"
    km_send_key "$MONITOR_SOCK" ret
    sleep "$WAIT_GUM_PROMPT_S"
    shot "after-install-pick"

    # Step 2: disk select — only /dev/vda exists in this QEMU. Default
    # selection = first row.
    log "step: select disk (/dev/vda — only one)"
    km_send_key "$MONITOR_SOCK" ret
    sleep "$WAIT_GUM_PROMPT_S"
    shot "after-disk-pick"

    # Step 3: LUKS passphrase. gum input --password reads stdin until newline.
    log "step: enter LUKS passphrase"
    km_send_str "$MONITOR_SOCK" "$LUKS_PW"
    sleep 1
    km_send_key "$MONITOR_SOCK" ret
    sleep "$WAIT_AFTER_INPUT_S"
    shot "after-luks-pw"

    # Step 4: admin password.
    log "step: enter admin password"
    km_send_str "$MONITOR_SOCK" "$ADMIN_PW"
    sleep 1
    km_send_key "$MONITOR_SOCK" ret
    sleep "$WAIT_AFTER_INPUT_S"
    shot "after-admin-pw"

    # Step 5: locale select — first option = en_GB.UTF-8.
    log "step: confirm locale (en_GB.UTF-8)"
    km_send_key "$MONITOR_SOCK" ret
    sleep "$WAIT_GUM_PROMPT_S"
    shot "after-locale"

    # Step 6: confirm screen. gum confirm defaults to "Yes" focused →
    # Enter accepts. (Verified against gum 0.13+ docs; if defaults change
    # in a future gum, swap to explicit "y" via key map.)
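    # Hypothetical swap, not exercised here: gum confirm also binds y/n
    # directly, so the explicit form would be a single keypress, no Enter:
    #     km_send_key "$MONITOR_SOCK" y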
log "step: confirm install" km_send_key "$MONITOR_SOCK" ret sleep "$WAIT_AFTER_INPUT_S" shot "after-confirm" log "installer driven: anaconda should now be running" } # Quick non-blocking SSH probe. Returns 0 if reachable. ssh_alive() { ssh -p $SSH_PORT \ -o StrictHostKeyChecking=no \ -o UserKnownHostsFile=/dev/null \ -o ConnectTimeout=3 \ -o BatchMode=yes \ ${TEST_KEY:+-i "$TEST_KEY"} \ "$SSH_USER@127.0.0.1" true 2>/dev/null } # Poll for anaconda completion + SSH availability. We can't watch QEMU exit # (anaconda's `reboot` directive triggers systemctl reboot, which doesn't # poweroff the VM — it boots back into the installed disk). The signal we # actually trust is SSH on port 2222 starting to answer. # # If cloud-init didn't run (the seed ISO might not have been picked up by # anaconda's installed system, depending on whether /etc/cloud is in the # installed package set), SSH will never come up via key auth. The fallback # in tty1_unlock_ssh() drives the SDDM/console login by hand. wait_for_install_and_reboot() { log "waiting up to ${ANACONDA_TIMEOUT_S}s for anaconda + reboot + SSH" local waited=0 last_shot=0 last_ppm_hash="" same_count=0 while (( waited < ANACONDA_TIMEOUT_S )); do if ! qemu_alive; then fail "QEMU exited unexpectedly during install (check $SERIAL_LOG)" fi # SSH probe — first PASS exits the loop. if ssh_alive; then log "SSH up — installed system reachable" shot "ssh-up" return 0 fi # Periodic screenshot + stuck-screen detection. if (( waited - last_shot >= ANACONDA_POLL_S )); then local ppm="$SHOT_PREFIX-poll.ppm" km_monitor_send "$MONITOR_SOCK" "screendump $ppm" sleep 1 if [[ -f $ppm ]]; then local h h=$(sha256sum "$ppm" 2>/dev/null | cut -d' ' -f1) if [[ -n $last_ppm_hash && $h == "$last_ppm_hash" ]]; then same_count=$((same_count + 1)) else same_count=0 fi last_ppm_hash=$h rm -f "$ppm" fi # 5 minutes of identical frames = stuck. Anaconda's text-mode # progress refreshes at least every minute, so 10 frames in a # row (5 min @ 30s cadence) identical means it's wedged. if (( same_count >= 10 )); then shot "stuck" fail "screen unchanged for 5min — anaconda likely crashed" fi last_shot=$waited log "anaconda still running... (${waited}s elapsed)" fi sleep 5 waited=$((waited + 5)) done shot "ssh-timeout" log "SSH never came up via cloud-init; trying TTY1 fallback" if tty1_unlock_ssh; then log "TTY1 fallback succeeded; SSH should be reachable" return 0 fi fail "anaconda did not complete + SSH within ${ANACONDA_TIMEOUT_S}s, TTY1 fallback also failed" } # TTY1 fallback: the installed system reached SDDM (graphical) or got stuck # at LUKS prompt. We drop to a TTY, log in as admin (chage forces password # change on first use), and undo the sshd hardening so our pubkey works. # # This is best-effort. If the LUKS prompt is still up — we can't get past # it without typing the passphrase, which we do here too. tty1_unlock_ssh() { log "TTY1 fallback: typing LUKS passphrase + admin login + opening sshd" # Switch to tty1 in case SDDM grabbed graphical. km_send_chord "$MONITOR_SOCK" ctrl alt f3 sleep 3 # If we're at LUKS prompt, the passphrase clears it. If we're already # past LUKS, this is a harmless garbage on the login prompt — we Enter # to clear, then proceed with login. km_send_str "$MONITOR_SOCK" "$LUKS_PW" km_send_key "$MONITOR_SOCK" ret sleep 30 # cryptsetup unlock + boot to login prompt shot "tty3-prelogin" # Username — admin. chage -d 0 means we'll be prompted to change pw on # first login. 
    # Username — admin. chage -d 0 means we'll be prompted to change pw on
    # first use. The old password is whatever we typed at install time; the
    # new password just has to satisfy PAM minlen — reuse $ADMIN_PW with a
    # "new" suffix to make passwd's "must differ" check happy.
    km_send_line "$MONITOR_SOCK" "admin"
    sleep 3
    km_send_line "$MONITOR_SOCK" "$ADMIN_PW"
    sleep 5
    # Old pw prompt (chage forced).
    km_send_line "$MONITOR_SOCK" "$ADMIN_PW"
    sleep 2
    # New pw twice. Use a derivative; PAM rejects identical-to-old and we
    # don't want to surprise the user with a password change.
    km_send_line "$MONITOR_SOCK" "${ADMIN_PW}new"
    sleep 1
    km_send_line "$MONITOR_SOCK" "${ADMIN_PW}new"
    sleep 5
    shot "tty3-loggedin"

    # Inject host pubkey + remove sshd hardening + reload sshd.
    local pubkey=""
    if [[ -n "${TEST_KEY:-}" && -f "${TEST_KEY}.pub" ]]; then
        pubkey=$(< "${TEST_KEY}.pub")
    fi
    if [[ -z $pubkey ]]; then
        log "TTY1 fallback: no pubkey to inject — cannot recover SSH"
        return 1
    fi
    km_send_line "$MONITOR_SOCK" "mkdir -p ~/.ssh && chmod 700 ~/.ssh"
    sleep 1
    km_send_line "$MONITOR_SOCK" "echo '$pubkey' >> ~/.ssh/authorized_keys"
    sleep 1
    km_send_line "$MONITOR_SOCK" "chmod 600 ~/.ssh/authorized_keys"
    sleep 1
    km_send_line "$MONITOR_SOCK" "echo '${ADMIN_PW}new' | sudo -S rm -f /etc/ssh/sshd_config.d/10-veilor-hardening.conf"
    sleep 2
    km_send_line "$MONITOR_SOCK" "echo '${ADMIN_PW}new' | sudo -S systemctl reload sshd"
    sleep 5

    # Wait up to 60s for SSH to actually answer.
    local i
    for ((i=0; i<60; i++)); do
        if ssh_alive; then
            log "TTY1 fallback: SSH reachable after ${i}s"
            return 0
        fi
        sleep 1
    done
    return 1
}

# ── Validation ─────────────────────────────────────────────────────────

# Run a single SSH command, return its stdout. Failures are NOT fatal here
# — caller decides what's a hard failure.
remote() {
    ssh -p $SSH_PORT \
        -o StrictHostKeyChecking=no \
        -o UserKnownHostsFile=/dev/null \
        -o BatchMode=yes \
        ${TEST_KEY:+-i "$TEST_KEY"} \
        "$SSH_USER@127.0.0.1" "$@"
}

# Validation result accumulator. check_remote runs a shell snippet on the
# installed VM via SSH; the snippet must exit 0 for PASS, non-zero for
# FAIL. check_eq compares remote stdout to an expected literal.
VALIDATIONS=()

# check_remote <desc> <cmd>
# Runs the snippet via SSH, treats exit code as the verdict.
check_remote() {
    local desc=$1 cmd=$2
    local out rc
    out=$(remote "$cmd" 2>&1)
    rc=$?
    if (( rc == 0 )); then
        VALIDATIONS+=("PASS $desc")
        log "  PASS: $desc"
    else
        # Truncate the failure context so the report stays scannable.
        local trimmed=${out:0:120}
        VALIDATIONS+=("FAIL $desc ($trimmed)")
        log "  FAIL: $desc -- $trimmed"
    fi
}

# check_eq <desc> <cmd> <expected>
# Runs the snippet, trims trailing whitespace, compares to expected.
check_eq() {
    local desc=$1 cmd=$2 expected=$3
    local got
    got=$(remote "$cmd" 2>/dev/null | tr -d '\r' | tail -n1)
    got="${got%"${got##*[![:space:]]}"}"   # strip trailing whitespace
    if [[ $got == "$expected" ]]; then
        VALIDATIONS+=("PASS $desc (=$got)")
        log "  PASS: $desc (=$got)"
    else
        VALIDATIONS+=("FAIL $desc (got: '$got', expected: '$expected')")
        log "  FAIL: $desc -- got '$got' expected '$expected'"
    fi
}

run_validation() {
    log "running validation checklist"

    # os-release
    check_remote "/etc/os-release: NAME=veilor-os" \
        'grep -q "^NAME=.veilor-os" /etc/os-release'
    check_eq "hostnamectl --static = veilor" \
        'hostnamectl --static' "veilor"

    # Active services
    for svc in sshd fail2ban usbguard tuned auditd firewalld chronyd sddm; do
        check_eq "$svc is-active" \
            "systemctl is-active $svc" "active"
    done

    # SELinux. v0.5.x kickstart sets `selinux --enforcing` for installed
    # systems but veilor-firstboot may toggle behavior — accept either
    # Enforcing or Permissive, but log which one we got. (Hard-fail on
    # Disabled.)
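    # getenforce prints exactly one of Enforcing / Permissive / Disabled,
    # so a literal match on the trimmed last output line is safe.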
    local selinux
    selinux=$(remote getenforce 2>/dev/null | tr -d '\r' | tail -n1)
    selinux="${selinux%"${selinux##*[![:space:]]}"}"   # strip trailing whitespace
    if [[ $selinux == Enforcing ]]; then
        VALIDATIONS+=("PASS SELinux = Enforcing")
        log "  PASS: SELinux = Enforcing"
    elif [[ $selinux == Permissive ]]; then
        VALIDATIONS+=("PASS SELinux = Permissive (acceptable for v0.5)")
        log "  PASS (soft): SELinux = Permissive"
    else
        VALIDATIONS+=("FAIL SELinux = $selinux")
        log "  FAIL: SELinux = $selinux"
    fi

    # Disk layout: LUKS2 + btrfs.
    check_remote "lsblk shows crypto_LUKS" \
        'lsblk -f | grep -q crypto_LUKS'
    check_remote "lsblk shows btrfs" \
        'lsblk -f | grep -q btrfs'
    check_remote "/etc/crypttab has LUKS entry" \
        'grep -Ev "^\s*(#|$)" /etc/crypttab | grep -qi luks'

    # Admin user
    check_remote "admin user exists" \
        'getent passwd admin | grep -q "^admin:"'

    # CLI tools shipped via overlay.
    for bin in veilor-power veilor-doctor veilor-update; do
        check_remote "/usr/local/bin/$bin present" \
            "test -x /usr/local/bin/$bin"
    done

    # init_on_alloc — veilor-installer kickstart sets it on the install
    # cmdline (line 315). /proc/cmdline is the source of truth.
    check_remote "init_on_alloc=1 in /proc/cmdline" \
        'grep -q init_on_alloc=1 /proc/cmdline'
}

# ── Reporting ──────────────────────────────────────────────────────────

print_report() {
    local pass=0 fail=0
    for line in "${VALIDATIONS[@]}"; do
        case "$line" in
            PASS*) pass=$((pass + 1)) ;;
            FAIL*) fail=$((fail + 1)) ;;
        esac
    done

    {
        echo "════════════════════════════════════════════════════════"
        echo " veilor-os auto-install test report"
        echo " $(date)"
        echo "════════════════════════════════════════════════════════"
        printf '%s\n' "${VALIDATIONS[@]}"
        echo "────────────────────────────────────────────────────────"
        printf 'TOTAL: %d PASS, %d FAIL\n' "$pass" "$fail"
        echo "Logs:        $LOG"
        echo "Screenshots: ${SHOT_PREFIX}-NN-*.png"
        echo "Serial log:  $SERIAL_LOG"
        echo "════════════════════════════════════════════════════════"
    } | tee -a "$LOG"

    # Capture a final SSH session snapshot (uname, lsblk, cmdline, failed
    # units) for the human reviewer.
    {
        echo "=== final ssh probe ==="
        date
        echo "--- uname -a ---"
        remote uname -a 2>&1
        echo "--- lsblk -f ---"
        remote lsblk -f 2>&1
        echo "--- /proc/cmdline ---"
        remote cat /proc/cmdline 2>&1
        echo "--- systemctl --failed ---"
        remote systemctl --failed 2>&1
    } > "${SHOT_PREFIX}-final-ssh.txt" 2>&1 || true
    log "final ssh snapshot: ${SHOT_PREFIX}-final-ssh.txt"

    if (( fail > 0 )); then
        return 1
    fi
    return 0
}

cleanup() {
    log "cleanup"
    if [[ -n "${QEMU_PID:-}" ]] && kill -0 "$QEMU_PID" 2>/dev/null; then
        # Graceful shutdown via monitor first; SIGTERM if it ignores us.
        km_monitor_send "$MONITOR_SOCK" "system_powerdown" 2>/dev/null || true
        sleep 5
        if kill -0 "$QEMU_PID" 2>/dev/null; then
            kill "$QEMU_PID" 2>/dev/null || true
            sleep 2
            kill -9 "$QEMU_PID" 2>/dev/null || true
        fi
    fi
    rm -f "$MONITOR_SOCK"
}

# ── Main ───────────────────────────────────────────────────────────────

main() {
    trap cleanup EXIT
    preflight "$@"
    kill_existing_vm
    wipe_state
    build_seed_iso
    launch_vm "$ISO"
    drive_installer
    wait_for_install_and_reboot
    run_validation
    print_report
}

main "$@"