test/auto-install.sh — boots the ISO, drives the gum installer via QEMU monitor sendkey with hardcoded test answers, waits for anaconda, reboots into the installed system, SSHes in, and runs a validation checklist.
654 lines · 24 KiB · Bash · Executable file
#!/usr/bin/env bash
# auto-install.sh — autonomous end-to-end install test for veilor-os.
#
# Boots a fresh ISO under QEMU, drives the gum installer via the QEMU
# monitor (sendkey events), waits for anaconda to finish + reboot, SSHes
# into the installed system, and runs a validation checklist.
#
# Usage:
#   ./test/auto-install.sh path/to/veilor-os-*.iso
#
# Expected runtime:
#   * boot + drive installer:  ~3 min
#   * anaconda install (KDE):  ~15-25 min (depends on mirrors + host CPU)
#   * reboot + SSH up:         ~2 min
#   * validation checks:       <1 min
#   * total:                   20-30 min wall clock
#
# Hardcoded test inputs (do NOT edit — meant to be deterministic):
#   disk       first /dev/vda  (only disk in QEMU)
#   hostname   "veilor"        (installer hardcodes this in v0.5.4)
#   LUKS pw    testpass1234
#   admin pw   adminpass1234
#   locale     en_GB.UTF-8     (first option, accepted with Enter)
#
# Outputs:
#   /tmp/veilor-auto-install.log            — full driver log
#   /tmp/veilor-auto-install-NN-<step>.png  — milestone screenshots
#   /tmp/veilor-auto-install-final-ssh.txt  — final SSH session capture
#
# Exit codes:
#   0 = all validation checks passed
#   1 = any failure (anaconda crash, SSH never up, validation failed)
#   2 = preflight failure (missing tool, bad ISO arg)
#
# This script intentionally does not source test/run-vm.sh — it needs a
# different QEMU configuration (no live cloud-init seed since we're driving
# the installed-system path), and run-vm.sh `exec`s qemu, which is
# incompatible with running QEMU as a backgrounded child here.

# Deliberately NOT `set -e`: the poll loops below expect individual ssh/kill
# probes to fail routinely; errors are handled explicitly via fail()/exit.
set -uo pipefail
|
|
|
|
# ── Constants ──────────────────────────────────────────────────────────
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"  # absolute dir of this script
TEST_DIR="$SCRIPT_DIR"
# Per-run VM state files, all kept next to the script.
DISK="$TEST_DIR/auto-install-vm.qcow2"
NVRAM="$TEST_DIR/auto-install-vm.nvram"
MONITOR_SOCK="$TEST_DIR/auto-install-vm.monitor.sock"
SERIAL_LOG="$TEST_DIR/auto-install-vm.serial.log"
SEED_ISO="$TEST_DIR/auto-install-seed.iso"

# Run artifacts land in /tmp (see header "Outputs").
LOG=/tmp/veilor-auto-install.log
SHOT_PREFIX=/tmp/veilor-auto-install
SSH_PORT=2222   # host port forwarded to guest :22 (see -netdev hostfwd)
SSH_USER=admin

# Deterministic test credentials — see header; do NOT edit.
LUKS_PW="testpass1234"
ADMIN_PW="adminpass1234"

# Disk: 40G is enough headroom — KDE base + 8G LUKS + LVM overhead fits in
# ~12G actual, but qcow2 only allocates blocks that get touched.
DISK_SIZE=40G

# OVMF firmware paths — Fedora layout. Caller can override if needed.
OVMF_CODE="${OVMF_CODE:-/usr/share/edk2/ovmf/OVMF_CODE.fd}"
OVMF_VARS_SRC="${OVMF_VARS_SRC:-/usr/share/edk2/ovmf/OVMF_VARS.fd}"

# Timing knobs — coarse but deliberate. Tighten only after observing slack
# on a real run.
WAIT_MONITOR_S=120           # qemu boot to monitor socket alive
WAIT_INSTALLER_BANNER_S=180  # ISO boot → tty1 gum menu visible
WAIT_GUM_PROMPT_S=8          # gum draws each prompt within ~5s
WAIT_AFTER_INPUT_S=3         # let UI advance after we hit Enter
ANACONDA_TIMEOUT_S=2700      # 45 min — anaconda + reboot + SSH come-up
ANACONDA_POLL_S=30           # screenshot/poll cadence during install

# ── Logging ────────────────────────────────────────────────────────────
: > "$LOG"   # truncate the driver log at the start of every run
# log <msg...> — timestamped line to stdout AND $LOG.
log() { printf '[%s] %s\n' "$(date +'%H:%M:%S')" "$*" | tee -a "$LOG"; }
# fail <msg...> — log and abort the whole run with exit 1.
fail() { log "FAIL: $*"; exit 1; }

# Source the keymap helper (provides km_send_key/km_send_str/km_send_line/
# km_send_chord/km_screendump/km_monitor_send/km_wait_socket/km_require_tools).
# shellcheck source=auto-install-keymap.sh
. "$SCRIPT_DIR/auto-install-keymap.sh"
|
|
|
|
# ── Preflight ──────────────────────────────────────────────────────────
# Validate the CLI argument and host tooling before touching any VM state.
# Sets the global ISO on success; exits 2 on any preflight problem.
preflight() {
  log "preflight: checking environment"

  ISO="${1:-}"
  [[ -n $ISO ]] || { echo "Usage: $0 <path-to-veilor-os.iso>" >&2; exit 2; }
  [[ -f $ISO ]] || { echo "[ERR] ISO not found: $ISO" >&2; exit 2; }

  km_require_tools || exit 2
  local tool
  for tool in ssh ssh-keygen pgrep pkill; do
    command -v "$tool" >/dev/null 2>&1 || { echo "[ERR] missing $tool" >&2; exit 2; }
  done

  [[ -f $OVMF_CODE ]] || {
    echo "[ERR] OVMF firmware missing: $OVMF_CODE (install edk2-ovmf)" >&2
    exit 2
  }

  log "preflight: ISO=$ISO"
}
|
|
|
|
# ── VM lifecycle ───────────────────────────────────────────────────────

# Kill any QEMU we previously started + scrub state files. Idempotent —
# safe to call when nothing is running.
kill_existing_vm() {
  log "killing any existing auto-install QEMU"
  local pid=${QEMU_PID:-}
  if [[ -n $pid ]] && kill -0 "$pid" 2>/dev/null; then
    kill "$pid" 2>/dev/null || true
    sleep 2
    kill -9 "$pid" 2>/dev/null || true
  fi
  # Catch orphans from prior runs — match by disk path so we don't kill
  # the user's other QEMU VMs.
  pkill -f "qemu-system-x86_64.*$DISK" 2>/dev/null || true
  rm -f -- "$MONITOR_SOCK" "$SERIAL_LOG"
}
|
|
|
|
# Wipe disk + nvram + seed so each run starts from reproducible state,
# then recreate an empty qcow2 and a fresh per-run copy of OVMF VARS.
wipe_state() {
  log "wiping qcow2 + nvram"
  local stale
  for stale in "$DISK" "$NVRAM" "$SEED_ISO"; do
    rm -f -- "$stale"
  done
  qemu-img create -f qcow2 "$DISK" "$DISK_SIZE" >/dev/null
  cp "$OVMF_VARS_SRC" "$NVRAM"
}
|
|
|
|
# Build a NoCloud cloud-init seed ISO so anaconda's installed system picks
# up our SSH pubkey on first boot. The installer-generated ks doesn't
# explicitly invoke cloud-init, but Fedora ships cloud-init enabled by
# default in @core; if a cidata seed is present at boot, NoCloud datasource
# fires and we get key injection for free.
#
# Side effects:
#   * sets global TEST_KEY to the private key matching the injected pubkey
#   * writes $SEED_ISO, or clears SEED_ISO if no ISO builder is available
build_seed_iso() {
  # Prefer an existing host key so the operator can also SSH in manually.
  local pubkey="" found=""
  for cand in "$HOME/.ssh/id_ed25519.pub" "$HOME/.ssh/id_rsa.pub"; do
    if [[ -f $cand ]]; then
      pubkey="$(< "$cand")"
      found=$cand
      break
    fi
  done
  if [[ -z $pubkey ]]; then
    log "seed: no host SSH pubkey found at ~/.ssh/id_{ed25519,rsa}.pub"
    log "seed: generating throwaway test key"
    local key=$TEST_DIR/auto-install-id_ed25519
    rm -f "$key" "$key.pub"
    ssh-keygen -t ed25519 -N '' -f "$key" -C "veilor-auto-install" >/dev/null
    pubkey="$(< "$key.pub")"
    TEST_KEY="$key"
  else
    log "seed: using $found"
    # Match host id; assume corresponding private key exists alongside.
    TEST_KEY="${found%.pub}"
  fi

  # NoCloud wants exactly two files on a volume labeled "cidata".
  local d
  d=$(mktemp -d)
  cat > "$d/meta-data" <<EOF
instance-id: veilor-auto-install
local-hostname: veilor
EOF
  # user-data also reverts the sshd hardening drop-in so key auth works
  # on first boot (mirrors what tty1_unlock_ssh does by hand).
  cat > "$d/user-data" <<EOF
#cloud-config
users:
  - name: admin
    ssh_authorized_keys:
      - $pubkey
    lock_passwd: false
ssh_pwauth: true
runcmd:
  - rm -f /etc/ssh/sshd_config.d/10-veilor-hardening.conf
  - systemctl reload sshd || systemctl restart sshd || true
EOF
  # Either ISO builder produces an equivalent cidata volume.
  if command -v mkisofs >/dev/null 2>&1; then
    mkisofs -quiet -output "$SEED_ISO" -volid cidata -joliet -rock \
      "$d/user-data" "$d/meta-data"
  elif command -v xorriso >/dev/null 2>&1; then
    xorriso -as mkisofs -quiet -output "$SEED_ISO" -volid cidata \
      -joliet -rock "$d/user-data" "$d/meta-data"
  else
    # Non-fatal: the TTY fallback in tty1_unlock_ssh() can still recover.
    log "seed: no mkisofs/xorriso — SSH key injection unavailable"
    SEED_ISO=""
  fi
  rm -rf "$d"
  [[ -f $SEED_ISO ]] && log "seed: built $SEED_ISO"
}
|
|
|
|
# Launch QEMU in the background. Returns once the monitor socket is alive.
# Sets global QEMU_PID; dies via fail() if the monitor never opens.
launch_vm() {
  local iso=$1
  log "launching QEMU"

  # Attach the cidata seed only if build_seed_iso actually produced one.
  local seed_args=()
  [[ -n $SEED_ISO && -f $SEED_ISO ]] && \
    seed_args=(-drive "file=$SEED_ISO,media=cdrom,readonly=on")

  # q35 + SMM + OVMF: CODE pflash read-only, VARS is our per-run copy.
  # Headless (-display none) — the monitor's screendump is our only view.
  # Serial console is captured to $SERIAL_LOG for post-mortems.
  qemu-system-x86_64 \
    -name veilor-auto-install \
    -enable-kvm \
    -cpu host \
    -smp 4 \
    -m 4096 \
    -machine q35,smm=on \
    -global driver=cfi.pflash01,property=secure,value=on \
    -drive if=pflash,format=raw,readonly=on,file="$OVMF_CODE" \
    -drive if=pflash,format=raw,file="$NVRAM" \
    -drive file="$DISK",if=virtio,format=qcow2,cache=writeback \
    -drive file="$iso",media=cdrom,readonly=on \
    "${seed_args[@]}" \
    -monitor "unix:$MONITOR_SOCK,server,nowait" \
    -boot order=dc,menu=off \
    -netdev user,id=net0,hostfwd=tcp::${SSH_PORT}-:22 \
    -device virtio-net-pci,netdev=net0 \
    -device virtio-rng-pci \
    -vga virtio \
    -display none \
    -serial "file:$SERIAL_LOG" \
    >>"$LOG" 2>&1 &
  QEMU_PID=$!
  log "QEMU pid=$QEMU_PID"

  km_wait_socket "$MONITOR_SOCK" "$WAIT_MONITOR_S" \
    || fail "monitor socket never opened"
  log "monitor socket ready"
}
|
|
|
|
# Did QEMU die? Used at every poll; lets us bail with a useful message
# instead of waiting out the full timeout. True iff our child still exists.
qemu_alive() {
  local pid=${QEMU_PID:-}
  [[ -n $pid ]] && kill -0 "$pid" 2>/dev/null
}
|
|
|
|
# ── Driver: walk the installer flow ────────────────────────────────────

# Take a numbered milestone screenshot named <prefix>-NN-<label>.png.
# NN auto-increments via the SHOT_N counter.
SHOT_N=0
shot() {
  local label=$1 out
  (( SHOT_N += 1 ))
  printf -v out '%s-%02d-%s.png' "$SHOT_PREFIX" "$SHOT_N" "$label"
  km_screendump "$MONITOR_SOCK" "$out"
  log "screenshot: $out"
}
|
|
|
|
# Drive the gum installer screen-by-screen over the QEMU monitor.
# Pure sendkey choreography: each step is type/Enter → sleep → screenshot.
# There is no feedback channel from the guest, so the sleeps ARE the
# synchronization — do not reorder or shorten without observing a run.
drive_installer() {
  log "waiting ${WAIT_INSTALLER_BANNER_S}s for ISO boot + tty1 installer"

  # The live ISO autologs into multi-user.target, runs gum on tty1 via a
  # systemd unit that replaces getty (see overlay/etc/systemd/system/
  # veilor-installer.service if it exists; otherwise via the multi-user
  # default in kickstart line 250).
  sleep "$WAIT_INSTALLER_BANNER_S"
  qemu_alive || fail "QEMU died during ISO boot"
  shot "boot-banner"

  # Make absolutely sure we're on tty1 (the live ks sets multi-user.target
  # default, so we should already be there — but a stray graphical.target
  # on dev builds would silently swallow our keystrokes).
  km_send_chord "$MONITOR_SOCK" ctrl alt f1
  sleep "$WAIT_AFTER_INPUT_S"
  shot "tty1"

  # Step 1: top option = "Install" — gum choose has it pre-selected.
  log "step: select Install"
  km_send_key "$MONITOR_SOCK" ret
  sleep "$WAIT_GUM_PROMPT_S"
  shot "after-install-pick"

  # Step 2: disk select — only /dev/vda exists in this QEMU. Default
  # selection = first row.
  log "step: select disk (/dev/vda — only one)"
  km_send_key "$MONITOR_SOCK" ret
  sleep "$WAIT_GUM_PROMPT_S"
  shot "after-disk-pick"

  # Step 3: LUKS passphrase. gum input --password reads stdin until newline.
  log "step: enter LUKS passphrase"
  km_send_str "$MONITOR_SOCK" "$LUKS_PW"
  sleep 1
  km_send_key "$MONITOR_SOCK" ret
  sleep "$WAIT_AFTER_INPUT_S"
  shot "after-luks-pw"

  # Step 4: admin password.
  log "step: enter admin password"
  km_send_str "$MONITOR_SOCK" "$ADMIN_PW"
  sleep 1
  km_send_key "$MONITOR_SOCK" ret
  sleep "$WAIT_AFTER_INPUT_S"
  shot "after-admin-pw"

  # Step 5: locale select — first option = en_GB.UTF-8.
  log "step: confirm locale (en_GB.UTF-8)"
  km_send_key "$MONITOR_SOCK" ret
  sleep "$WAIT_GUM_PROMPT_S"
  shot "after-locale"

  # Step 6: confirm screen. gum confirm defaults to "Yes" focused →
  # Enter accepts. (Verified against gum 0.13+ docs; if defaults change
  # in a future gum, swap to explicit "y" via key map.)
  log "step: confirm install"
  km_send_key "$MONITOR_SOCK" ret
  sleep "$WAIT_AFTER_INPUT_S"
  shot "after-confirm"

  log "installer driven: anaconda should now be running"
}
|
|
|
|
# Quick non-blocking SSH probe. Returns 0 if reachable.
# Uses key auth only (BatchMode); adds -i $TEST_KEY when one was generated.
ssh_alive() {
  local -a opts=(
    -p "$SSH_PORT"
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o ConnectTimeout=3
    -o BatchMode=yes
  )
  ssh "${opts[@]}" ${TEST_KEY:+-i "$TEST_KEY"} "$SSH_USER@127.0.0.1" true 2>/dev/null
}
|
|
|
|
# Poll for anaconda completion + SSH availability. We can't watch QEMU exit
# (anaconda's `reboot` directive triggers systemctl reboot, which doesn't
# poweroff the VM — it boots back into the installed disk). The signal we
# actually trust is SSH on port 2222 starting to answer.
#
# If cloud-init didn't run (the seed ISO might not have been picked up by
# anaconda's installed system, depending on whether /etc/cloud is in the
# installed package set), SSH will never come up via key auth. The fallback
# in tty1_unlock_ssh() drives the console login by hand.
#
# Returns 0 when SSH answers (directly or via fallback); otherwise exits
# via fail().
wait_for_install_and_reboot() {
  log "waiting up to ${ANACONDA_TIMEOUT_S}s for anaconda + reboot + SSH"

  # waited: seconds elapsed; last_shot: time of last poll screenshot;
  # last_ppm_hash/same_count: stuck-screen detector state.
  local waited=0 last_shot=0 last_ppm_hash="" same_count=0
  while (( waited < ANACONDA_TIMEOUT_S )); do
    if ! qemu_alive; then
      fail "QEMU exited unexpectedly during install (check $SERIAL_LOG)"
    fi

    # SSH probe — first PASS exits the loop.
    if ssh_alive; then
      log "SSH up — installed system reachable"
      shot "ssh-up"
      return 0
    fi

    # Periodic screenshot + stuck-screen detection (every ANACONDA_POLL_S).
    if (( waited - last_shot >= ANACONDA_POLL_S )); then
      local ppm="$SHOT_PREFIX-poll.ppm"
      km_monitor_send "$MONITOR_SOCK" "screendump $ppm"
      sleep 1   # give QEMU a moment to write the dump
      if [[ -f $ppm ]]; then
        local h
        h=$(sha256sum "$ppm" 2>/dev/null | cut -d' ' -f1)
        # Identical frame hash → screen hasn't changed since last poll.
        if [[ -n $last_ppm_hash && $h == "$last_ppm_hash" ]]; then
          same_count=$((same_count + 1))
        else
          same_count=0
        fi
        last_ppm_hash=$h
        rm -f "$ppm"
      fi
      # 5 minutes of identical frames = stuck. Anaconda's text-mode
      # progress refreshes at least every minute, so 10 frames in a
      # row (5 min @ 30s cadence) identical means it's wedged.
      if (( same_count >= 10 )); then
        shot "stuck"
        fail "screen unchanged for 5min — anaconda likely crashed"
      fi
      last_shot=$waited
      log "anaconda still running... (${waited}s elapsed)"
    fi

    sleep 5
    waited=$((waited + 5))
  done

  # Timed out waiting for key-auth SSH — try the manual console recovery.
  shot "ssh-timeout"
  log "SSH never came up via cloud-init; trying TTY1 fallback"
  if tty1_unlock_ssh; then
    log "TTY1 fallback succeeded; SSH should be reachable"
    return 0
  fi
  fail "anaconda did not complete + SSH within ${ANACONDA_TIMEOUT_S}s, TTY1 fallback also failed"
}
|
|
|
|
# TTY fallback: the installed system reached SDDM (graphical) or got stuck
# at the LUKS prompt. We drop to a text VT (tty3 — despite the function's
# name), log in as admin (chage forces a password change on first use), and
# undo the sshd hardening so our pubkey works.
#
# This is best-effort. If the LUKS prompt is still up — we can't get past
# it without typing the passphrase, which we do here too.
#
# Returns 0 once SSH answers, 1 if recovery failed.
tty1_unlock_ssh() {
  log "TTY1 fallback: typing LUKS passphrase + admin login + opening sshd"

  # Switch to tty3 in case SDDM grabbed the graphical VT.
  km_send_chord "$MONITOR_SOCK" ctrl alt f3
  sleep 3

  # If we're at the LUKS prompt, the passphrase clears it. If we're already
  # past LUKS, this is harmless garbage on the login prompt — the Enter
  # clears it, then we proceed with login.
  km_send_str "$MONITOR_SOCK" "$LUKS_PW"
  km_send_key "$MONITOR_SOCK" ret
  sleep 30 # cryptsetup unlock + boot to login prompt

  shot "tty3-prelogin"

  # Username — admin. chage -d 0 means we'll be prompted to change pw on
  # first login. The old password is whatever we typed at install time;
  # the new password just has to satisfy PAM minlen — we reuse $ADMIN_PW
  # with a "new" suffix so passwd's "must differ from old" check passes.
  km_send_line "$MONITOR_SOCK" "admin"
  sleep 3
  km_send_line "$MONITOR_SOCK" "$ADMIN_PW"
  sleep 5
  # Old pw prompt (chage forced).
  km_send_line "$MONITOR_SOCK" "$ADMIN_PW"
  sleep 2
  # New pw twice. Must be a derivative — PAM rejects identical-to-old.
  # NOTE(review): on this fallback path the admin password really does
  # become "${ADMIN_PW}new"; the sudo commands below depend on that.
  km_send_line "$MONITOR_SOCK" "${ADMIN_PW}new"
  sleep 1
  km_send_line "$MONITOR_SOCK" "${ADMIN_PW}new"
  sleep 5

  shot "tty3-loggedin"

  # Inject host pubkey + remove sshd hardening + reload sshd.
  local pubkey=""
  if [[ -n "${TEST_KEY:-}" && -f "${TEST_KEY}.pub" ]]; then
    pubkey=$(< "${TEST_KEY}.pub")
  fi
  if [[ -z $pubkey ]]; then
    log "TTY1 fallback: no pubkey to inject — cannot recover SSH"
    return 1
  fi

  # Typed blind into the logged-in shell — each line is a command.
  km_send_line "$MONITOR_SOCK" "mkdir -p ~/.ssh && chmod 700 ~/.ssh"
  sleep 1
  km_send_line "$MONITOR_SOCK" "echo '$pubkey' >> ~/.ssh/authorized_keys"
  sleep 1
  km_send_line "$MONITOR_SOCK" "chmod 600 ~/.ssh/authorized_keys"
  sleep 1
  km_send_line "$MONITOR_SOCK" "echo '${ADMIN_PW}new' | sudo -S rm -f /etc/ssh/sshd_config.d/10-veilor-hardening.conf"
  sleep 2
  km_send_line "$MONITOR_SOCK" "echo '${ADMIN_PW}new' | sudo -S systemctl reload sshd"
  sleep 5

  # Wait up to 60s for SSH to actually answer.
  local i
  for ((i=0; i<60; i++)); do
    if ssh_alive; then
      log "TTY1 fallback: SSH reachable after ${i}s"
      return 0
    fi
    sleep 1
  done
  return 1
}
|
|
|
|
# ── Validation ─────────────────────────────────────────────────────────
# Run a single SSH command on the installed VM, return its stdout.
# Failures are NOT fatal here — caller decides what's a hard failure.
remote() {
  local -a opts=(
    -p "$SSH_PORT"
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o BatchMode=yes
  )
  ssh "${opts[@]}" ${TEST_KEY:+-i "$TEST_KEY"} "$SSH_USER@127.0.0.1" "$@"
}
|
|
|
|
# Validation result accumulator. check_remote runs a shell snippet on the
# installed VM via SSH; the snippet must exit 0 for PASS, non-zero for
# FAIL. check_eq compares remote stdout to an expected literal.
VALIDATIONS=()

# check_remote <desc> <remote shell snippet>
# The snippet's exit status is the verdict; appends "PASS <desc>" or
# "FAIL <desc> (<first 120 chars of combined output>)" to VALIDATIONS.
check_remote() {
  local desc=$1 cmd=$2 out
  if out=$(remote "$cmd" 2>&1); then
    VALIDATIONS+=("PASS $desc")
    log " PASS: $desc"
  else
    # Truncate the failure context so the report stays scannable.
    local snippet=${out:0:120}
    VALIDATIONS+=("FAIL $desc ($snippet)")
    log " FAIL: $desc -- $snippet"
  fi
}
|
|
|
|
# check_eq <desc> <remote shell snippet> <expected stdout>
# Runs the snippet, strips CRs, keeps the last output line, trims ALL
# trailing whitespace, and compares against the expected literal.
check_eq() {
  local desc=$1 cmd=$2 expected=$3
  local got
  got=$(remote "$cmd" 2>/dev/null | tr -d '\r' | tail -n1)
  # Strip the whole trailing-whitespace run. The previous
  # ${got%[[:space:]]} form removed at most ONE trailing character, so
  # output like "active  " or "veilor\t " spuriously failed comparison.
  got="${got%"${got##*[![:space:]]}"}"
  if [[ $got == "$expected" ]]; then
    VALIDATIONS+=("PASS $desc (=$got)")
    log " PASS: $desc (=$got)"
  else
    VALIDATIONS+=("FAIL $desc (got: '$got', expected: '$expected')")
    log " FAIL: $desc -- got '$got' expected '$expected'"
  fi
}
|
|
|
|
# Post-install validation checklist. Appends results to VALIDATIONS via
# check_remote/check_eq; never hard-exits, so one failed check still lets
# the rest of the report run.
run_validation() {
  log "running validation checklist"

  # os-release
  check_remote "/etc/os-release: NAME=veilor-os" \
    'grep -q "^NAME=.veilor-os" /etc/os-release'

  check_eq "hostnamectl --static = veilor" \
    'hostnamectl --static' "veilor"

  # Active services
  local svc
  for svc in sshd fail2ban usbguard tuned auditd firewalld chronyd sddm; do
    check_eq "$svc is-active" \
      "systemctl is-active $svc" "active"
  done

  # SELinux. v0.5.x kickstart sets `selinux --enforcing` for installed
  # systems but veilor-firstboot may toggle behavior — accept either
  # Enforcing or Permissive, but log which one we got. (Hard-fail on
  # Disabled.)
  local selinux
  selinux=$(remote getenforce 2>/dev/null | tr -d '\r' | tail -n1)
  # Strip the entire trailing-whitespace run. The old ${selinux%[[:space:]]}
  # form removed at most ONE character, so e.g. "Enforcing  " was
  # misclassified as a FAIL.
  selinux="${selinux%"${selinux##*[![:space:]]}"}"
  if [[ $selinux == Enforcing ]]; then
    VALIDATIONS+=("PASS SELinux = Enforcing")
    log " PASS: SELinux = Enforcing"
  elif [[ $selinux == Permissive ]]; then
    VALIDATIONS+=("PASS SELinux = Permissive (acceptable for v0.5)")
    log " PASS (soft): SELinux = Permissive"
  else
    VALIDATIONS+=("FAIL SELinux = $selinux")
    log " FAIL: SELinux = $selinux"
  fi

  # Disk layout: LUKS2 + btrfs.
  check_remote "lsblk shows crypto_LUKS" \
    'lsblk -f | grep -q crypto_LUKS'
  check_remote "lsblk shows btrfs" \
    'lsblk -f | grep -q btrfs'
  check_remote "/etc/crypttab has LUKS entry" \
    'grep -Ev "^\s*(#|$)" /etc/crypttab | grep -qi luks'

  # Admin user
  check_remote "admin user exists" \
    'getent passwd admin | grep -q "^admin:"'

  # CLI tools shipped via overlay.
  local bin
  for bin in veilor-power veilor-doctor veilor-update; do
    check_remote "/usr/local/bin/$bin present" \
      "test -x /usr/local/bin/$bin"
  done

  # init_on_alloc — veilor-installer kickstart sets it on the install
  # cmdline (line 315). /proc/cmdline is the source of truth.
  check_remote "init_on_alloc=1 in /proc/cmdline" \
    'grep -q init_on_alloc=1 /proc/cmdline'
}
|
|
|
|
# ── Reporting ──────────────────────────────────────────────────────────
# Render the PASS/FAIL summary to stdout + $LOG, dump a final SSH probe
# for the human reviewer, and return 0 only when every validation passed.
print_report() {
  local pass=0 fail=0 entry
  for entry in "${VALIDATIONS[@]}"; do
    [[ $entry == PASS* ]] && pass=$((pass + 1))
    [[ $entry == FAIL* ]] && fail=$((fail + 1))
  done

  {
    echo "════════════════════════════════════════════════════════"
    echo " veilor-os auto-install test report"
    echo " $(date)"
    echo "════════════════════════════════════════════════════════"
    printf '%s\n' "${VALIDATIONS[@]}"
    echo "────────────────────────────────────────────────────────"
    printf 'TOTAL: %d PASS, %d FAIL\n' "$pass" "$fail"
    echo "Logs: $LOG"
    echo "Screenshots: ${SHOT_PREFIX}-NN-*.png"
    echo "Serial log: $SERIAL_LOG"
    echo "════════════════════════════════════════════════════════"
  } | tee -a "$LOG"

  # Capture a final SSH session snapshot (uname/lsblk/sysctl) for the
  # human reviewer.
  {
    echo "=== final ssh probe ==="
    date
    echo "--- uname -a ---"
    remote uname -a 2>&1
    echo "--- lsblk -f ---"
    remote lsblk -f 2>&1
    echo "--- /proc/cmdline ---"
    remote cat /proc/cmdline 2>&1
    echo "--- systemctl --failed ---"
    remote systemctl --failed 2>&1
  } > "${SHOT_PREFIX}-final-ssh.txt" 2>&1 || true
  log "final ssh snapshot: ${SHOT_PREFIX}-final-ssh.txt"

  # 0 only if nothing failed — this becomes the script's exit code.
  (( fail == 0 ))
}
|
|
|
|
# EXIT-trap handler: gracefully stop our QEMU child (monitor powerdown →
# SIGTERM → SIGKILL) and remove the monitor socket. Safe when no VM ran.
cleanup() {
  log "cleanup"
  local pid=${QEMU_PID:-}
  if [[ -n $pid ]] && kill -0 "$pid" 2>/dev/null; then
    # Graceful shutdown via monitor first; escalate if it ignores us.
    km_monitor_send "$MONITOR_SOCK" "system_powerdown" 2>/dev/null || true
    sleep 5
    if kill -0 "$pid" 2>/dev/null; then
      kill "$pid" 2>/dev/null || true
      sleep 2
      kill -9 "$pid" 2>/dev/null || true
    fi
  fi
  rm -f -- "$MONITOR_SOCK"
}
|
|
|
|
# ── Main ───────────────────────────────────────────────────────────────
# Orchestrates the full run. Exit status is print_report's verdict
# (0 = all validations passed, 1 = any FAIL), or 1 via fail() / 2 via
# preflight() along the way. cleanup runs on every exit path via the trap.
main() {
  trap cleanup EXIT

  preflight "$@"                # validate args/tools; sets global ISO (exit 2 on error)
  kill_existing_vm              # scrub leftover QEMU + state from prior runs
  wipe_state                    # fresh qcow2 + per-run NVRAM copy
  build_seed_iso                # NoCloud seed for SSH key injection (best-effort)
  launch_vm "$ISO"              # background QEMU; returns once monitor is up
  drive_installer               # sendkey walk through the gum prompts
  wait_for_install_and_reboot   # poll until SSH answers (or fail out)
  run_validation                # populate VALIDATIONS via SSH checks
  print_report                  # summarize; its return value is our exit code
}

main "$@"
|