From be761d57fcd6ba2af0ebb5c4e90a2278d3562177 Mon Sep 17 00:00:00 2001 From: veilor-org Date: Sat, 2 May 2026 21:24:59 +0100 Subject: [PATCH] v0.5.5: autonomous install test harness test/auto-install.sh boots ISO, drives gum installer via QEMU monitor sendkey with hardcoded test answers, waits for anaconda, reboots into installed system, SSHs in, runs validation checklist. --- test/README.md | 66 ++++ test/auto-install-keymap.sh | 167 +++++++++ test/auto-install.sh | 654 ++++++++++++++++++++++++++++++++++++ 3 files changed, 887 insertions(+) create mode 100644 test/README.md create mode 100755 test/auto-install-keymap.sh create mode 100755 test/auto-install.sh diff --git a/test/README.md b/test/README.md new file mode 100644 index 0000000..e322dc5 --- /dev/null +++ b/test/README.md @@ -0,0 +1,66 @@ +# test/ + +Test harnesses for veilor-os ISO builds. + +## Files + +| File | Purpose | +|------|---------| +| `run-vm.sh` | Manual smoke test — boot the latest ISO interactively in QEMU/KVM. SSH key injection via cloud-init seed + monitor sendkey fallback for live-image login. | +| `auto-install.sh` | **Autonomous** end-to-end install test. Boots ISO, drives the gum installer via QEMU monitor `sendkey`, waits for anaconda to finish + reboot, SSHs into the installed system, runs validation checklist. Prints PASS/FAIL summary. | +| `auto-install-keymap.sh` | Sourced helper. Provides `km_send_str`, `km_send_chord`, `km_send_key`, `km_screendump`, `km_wait_socket`, etc. Reusable by other automation. | +| `boot-checklist.md` | Manual post-install checklist (run on a real spare laptop). 
|

## Running the autonomous installer test

```sh
./test/auto-install.sh build/out/veilor-os-*.iso
```

Hardcoded inputs (deterministic — do not edit during a test run):
- Disk: first `/dev/vda` (the only disk in QEMU)
- Hostname: `veilor` (installer hardcoded since v0.5.4)
- LUKS passphrase: `testpass1234`
- Admin password: `adminpass1234`
- Locale: `en_GB.UTF-8`

Expected runtime: 20–30 minutes wall clock (anaconda dominates).

### Outputs

- `/tmp/veilor-auto-install.log` — full driver log
- `/tmp/veilor-auto-install-NN-<label>.png` — milestone screenshots
- `/tmp/veilor-auto-install-final-ssh.txt` — final SSH session capture (uname/lsblk/cmdline/failed units)

### Exit codes

- `0` — all validation checks passed
- `1` — any failure (anaconda crashed, SSH never came up, validation check failed)
- `2` — preflight failure (missing tool, bad ISO arg, missing OVMF)

### Prerequisites

- `qemu-system-x86_64`, `qemu-img`, `socat`, `ssh`, `ssh-keygen`
- `edk2-ovmf` (OVMF UEFI firmware at `/usr/share/edk2/ovmf/OVMF_{CODE,VARS}.fd`)
- `mkisofs` or `xorriso` (for cloud-init seed ISO; harness falls back to TTY1 driving if seed cannot be built or cloud-init does not run on the installed system)
- `convert` from ImageMagick (optional — converts PPM screendumps to PNG; harness keeps PPM if absent)
- KVM access (`/dev/kvm` readable by the user)

### What it validates

Post-install on the booted system:
- `/etc/os-release` → `NAME=veilor-os`
- `hostnamectl --static` → `veilor`
- `systemctl is-active` → `active` for `sshd fail2ban usbguard tuned auditd firewalld chronyd sddm`
- `getenforce` → `Enforcing` (preferred) or `Permissive` (acceptable for v0.5.x)
- `lsblk -f` shows `crypto_LUKS` + `btrfs`
- `/etc/crypttab` has a LUKS entry
- `getent passwd admin` returns the user
- `/usr/local/bin/{veilor-power,veilor-doctor,veilor-update}` are present and executable
- `/proc/cmdline` contains `init_on_alloc=1`

### Troubleshooting

- 
**Stuck at boot banner**: ISO didn't autostart `veilor-installer` on tty1. Check `serial.log` and `auto-install-vm-NN-*.png` screenshots. The harness aborts after 5 minutes of identical screen frames. +- **SSH never up**: cloud-init may not have run on the installed system (no `cidata` mount). The harness falls back to TTY1 driving — typing the LUKS passphrase, logging in as admin, and hand-injecting the SSH key. If both paths fail, validation cannot proceed. +- **`screendump` produces unreadable PPM**: install ImageMagick (`dnf install ImageMagick`) so the harness converts to PNG. diff --git a/test/auto-install-keymap.sh b/test/auto-install-keymap.sh new file mode 100755 index 0000000..56fdebd --- /dev/null +++ b/test/auto-install-keymap.sh @@ -0,0 +1,167 @@ +#!/usr/bin/env bash +# auto-install-keymap.sh — sourced helper for QEMU-monitor-driven UI automation. +# +# Provides a minimal but complete US-layout keymap mapping every printable +# ASCII character to a QEMU `sendkey` chord, plus convenience wrappers for +# typing strings, sending special keys, taking screenshots, and waiting for +# the monitor socket to appear. +# +# Usage: +# source test/auto-install-keymap.sh +# MONITOR_SOCK=/path/to/sock +# km_wait_socket "$MONITOR_SOCK" 60 +# km_send_str "$MONITOR_SOCK" "hello world" +# km_send_key "$MONITOR_SOCK" ret +# km_send_chord "$MONITOR_SOCK" ctrl alt f1 +# km_screendump "$MONITOR_SOCK" /tmp/shot.ppm +# +# Why a separate file: other harnesses (regression suites, fuzzers) can +# source this without dragging in the full installer test driver. + +# Guard against double-source. +[[ -n "${__VEILOR_KEYMAP_LOADED:-}" ]] && return 0 +__VEILOR_KEYMAP_LOADED=1 + +# ── Tool requirements ────────────────────────────────────────────────── +# socat is the canonical way to talk to a unix-domain QEMU monitor. +# nc-openbsd would also work but socat is what run-vm.sh already uses. 
+km_require_tools() { + local missing=() + for t in socat qemu-img qemu-system-x86_64; do + command -v "$t" >/dev/null 2>&1 || missing+=("$t") + done + if [[ ${#missing[@]} -gt 0 ]]; then + echo "[ERR] missing required tools: ${missing[*]}" >&2 + return 1 + fi +} + +# ── Low-level monitor I/O ────────────────────────────────────────────── +# Send a single line of monitor input. Newlines are critical — QEMU's HMP +# parses one command per line. Errors are swallowed: the most common cause +# is the VM having shut down between two send_* calls, which we tolerate. +km_monitor_send() { + local sock=$1; shift + printf '%s\n' "$*" | socat - "UNIX-CONNECT:$sock" 2>/dev/null || true +} + +# Send a raw HMP command and capture any stdout response (e.g. for `info` +# queries). Trims the QEMU monitor banner + prompt noise. +km_monitor_query() { + local sock=$1; shift + printf '%s\n' "$*" | socat -t 1 - "UNIX-CONNECT:$sock" 2>/dev/null \ + | sed -e 's/\r//g' -e '/^QEMU /d' -e '/^(qemu)/d' || true +} + +# Wait until the monitor unix socket exists and accepts connections. +# $2 = max wait in seconds (default 60). +km_wait_socket() { + local sock=$1 max=${2:-60} waited=0 + while (( waited < max )); do + if [[ -S $sock ]]; then + # Try a no-op query — confirms the QEMU side is actually serving. + if printf 'info status\n' | socat -t 1 - "UNIX-CONNECT:$sock" >/dev/null 2>&1; then + return 0 + fi + fi + sleep 1 + waited=$((waited + 1)) + done + echo "[ERR] monitor socket $sock never became ready (waited ${max}s)" >&2 + return 1 +} + +# ── Screenshots ──────────────────────────────────────────────────────── +# Ask QEMU to dump the current framebuffer. Output is PPM. Convert to PNG +# with ImageMagick if available; otherwise leave PPM and warn. 
+km_screendump() { + local sock=$1 out=$2 + local ppm="${out%.png}.ppm" + km_monitor_send "$sock" "screendump $ppm" + sleep 1 # give QEMU a moment to flush + if [[ -f $ppm ]] && command -v convert >/dev/null 2>&1; then + convert "$ppm" "$out" 2>/dev/null && rm -f "$ppm" + fi +} + +# ── Key tables ───────────────────────────────────────────────────────── +# QEMU `sendkey` reference: docs/system/keys.html.in. The HMP names are +# the X11 keysym lower-case, with a few exceptions for non-letter keys +# (spc, ret, minus, etc.). What follows is the full US-layout printable +# ASCII set. Everything outside this table is silently dropped — callers +# are responsible for not feeding it characters the installer can't accept +# anyway (passwords are validated to ASCII-printable in veilor-installer). +declare -gA __KM_PLAIN=( + [' ']=spc [a]=a [b]=b [c]=c [d]=d [e]=e [f]=f [g]=g [h]=h + [i]=i [j]=j [k]=k [l]=l [m]=m [n]=n [o]=o [p]=p [q]=q [r]=r + [s]=s [t]=t [u]=u [v]=v [w]=w [x]=x [y]=y [z]=z + [0]=0 [1]=1 [2]=2 [3]=3 [4]=4 [5]=5 [6]=6 [7]=7 [8]=8 [9]=9 + ['-']=minus ['=']=equal ['[']=bracket_left [']']=bracket_right + [';']=semicolon ["'"]=apostrophe [',']=comma ['.']=dot + ['/']=slash ['\\']=backslash ['`']=grave_accent +) + +# Shift-prefixed (US): all caps + shifted-symbol row. +declare -gA __KM_SHIFT=( + [A]=a [B]=b [C]=c [D]=d [E]=e [F]=f [G]=g [H]=h [I]=i [J]=j + [K]=k [L]=l [M]=m [N]=n [O]=o [P]=p [Q]=q [R]=r [S]=s [T]=t + [U]=u [V]=v [W]=w [X]=x [Y]=y [Z]=z + ['!']=1 ['@']=2 ['#']=3 ['$']=4 ['%']=5 + ['^']=6 ['&']=7 ['*']=8 ['(']=9 [')']=0 + ['_']=minus ['+']=equal ['{']=bracket_left ['}']=bracket_right + [':']=semicolon ['"']=apostrophe ['<']=comma ['>']=dot + ['?']=slash ['|']=backslash ['~']=grave_accent +) + +# ── Public send wrappers ─────────────────────────────────────────────── +# Send a single named key (e.g. ret, esc, up, tab, f1). 
+km_send_key() { + local sock=$1 key=$2 + km_monitor_send "$sock" "sendkey $key" +} + +# Send a chord — components are joined with `-` per QEMU HMP syntax. +km_send_chord() { + local sock=$1; shift + local IFS='-' + km_monitor_send "$sock" "sendkey $*" +} + +# Type a string by encoding each character via the keymap. Unrecognised +# characters are skipped with a warning to stderr — caller is expected to +# stick to printable ASCII. +km_send_str() { + local sock=$1 s=$2 ch chord + local i=0 + while (( i < ${#s} )); do + ch="${s:i:1}" + if [[ -n "${__KM_PLAIN[$ch]:-}" ]]; then + chord="${__KM_PLAIN[$ch]}" + km_monitor_send "$sock" "sendkey $chord" + elif [[ -n "${__KM_SHIFT[$ch]:-}" ]]; then + chord="${__KM_SHIFT[$ch]}" + km_monitor_send "$sock" "sendkey shift-$chord" + else + printf '[WARN] km_send_str: unencodable char %q skipped\n' "$ch" >&2 + fi + i=$((i + 1)) + # Tiny gap so QEMU doesn't drop fast keypresses on busy hosts. + # Empirically 5ms = the line between "100% reliable" and "loses ~1%". + sleep 0.02 + done +} + +# Convenience: type a string then press Enter. +km_send_line() { + local sock=$1 s=$2 + km_send_str "$sock" "$s" + km_send_key "$sock" ret +} + +# Visual indicator for log readability — prints a banner + a short pause so +# the next monitor command has time to land on a stable UI frame. Used by +# the harness between major steps; safe to skip in automated reuse. +km_step_banner() { + local label=$1 + printf '\n──── %s @ %s ────\n' "$label" "$(date +'%H:%M:%S')" +} diff --git a/test/auto-install.sh b/test/auto-install.sh new file mode 100755 index 0000000..b77ecb2 --- /dev/null +++ b/test/auto-install.sh @@ -0,0 +1,654 @@ +#!/usr/bin/env bash +# auto-install.sh — autonomous end-to-end install test for veilor-os. +# +# Boots a fresh ISO under QEMU, drives the gum installer via the QEMU +# monitor (sendkey events), waits for anaconda to finish + reboot, SSHes +# into the installed system, and runs a validation checklist. 
+# +# Usage: +# ./test/auto-install.sh path/to/veilor-os-*.iso +# +# Expected runtime: +# * boot + drive installer: ~3 min +# * anaconda install (KDE): ~15-25 min (depends on mirrors + host CPU) +# * reboot + SSH up: ~2 min +# * validation checks: <1 min +# * total: 20-30 min wall clock +# +# Hardcoded test inputs (do NOT edit — meant to be deterministic): +# disk first /dev/vda (only disk in QEMU) +# hostname "veilor" (installer hardcodes this in v0.5.4) +# LUKS pw testpass1234 +# admin pw adminpass1234 +# locale en_GB.UTF-8 (first option, accepted with Enter) +# +# Outputs: +# /tmp/veilor-auto-install.log — full driver log +# /tmp/veilor-auto-install-NN-.png — milestone screenshots +# /tmp/veilor-auto-install-final-ssh.txt — final SSH session capture +# +# Exit codes: +# 0 = all validation checks passed +# 1 = any failure (anaconda crash, SSH never up, validation failed) +# 2 = preflight failure (missing tool, bad ISO arg) +# +# This script intentionally does not source test/run-vm.sh — it needs a +# different QEMU configuration (no live cloud-init seed since we're driving +# the installed-system path), and run-vm.sh `exec`s qemu, which is +# incompatible with running QEMU as a backgrounded child here. + +set -uo pipefail + +# ── Constants ────────────────────────────────────────────────────────── +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +TEST_DIR="$SCRIPT_DIR" +DISK="$TEST_DIR/auto-install-vm.qcow2" +NVRAM="$TEST_DIR/auto-install-vm.nvram" +MONITOR_SOCK="$TEST_DIR/auto-install-vm.monitor.sock" +SERIAL_LOG="$TEST_DIR/auto-install-vm.serial.log" +SEED_ISO="$TEST_DIR/auto-install-seed.iso" + +LOG=/tmp/veilor-auto-install.log +SHOT_PREFIX=/tmp/veilor-auto-install +SSH_PORT=2222 +SSH_USER=admin + +LUKS_PW="testpass1234" +ADMIN_PW="adminpass1234" + +# Disk: 40G is enough headroom — KDE base + 8G LUKS + LVM overhead fits in +# ~12G actual, but qcow2 only allocates blocks that get touched. +DISK_SIZE=40G + +# OVMF firmware paths — Fedora layout. 
Caller can override if needed. +OVMF_CODE="${OVMF_CODE:-/usr/share/edk2/ovmf/OVMF_CODE.fd}" +OVMF_VARS_SRC="${OVMF_VARS_SRC:-/usr/share/edk2/ovmf/OVMF_VARS.fd}" + +# Timing knobs — coarse but deliberate. Tighten only after observing slack +# on a real run. +WAIT_MONITOR_S=120 # qemu boot to monitor socket alive +WAIT_INSTALLER_BANNER_S=180 # ISO boot → tty1 gum menu visible +WAIT_GUM_PROMPT_S=8 # gum draws each prompt within ~5s +WAIT_AFTER_INPUT_S=3 # let UI advance after we hit Enter +ANACONDA_TIMEOUT_S=2700 # 45 min — anaconda + reboot + SSH come-up +ANACONDA_POLL_S=30 # screenshot/poll cadence during install + +# ── Logging ──────────────────────────────────────────────────────────── +: > "$LOG" +log() { printf '[%s] %s\n' "$(date +'%H:%M:%S')" "$*" | tee -a "$LOG"; } +fail() { log "FAIL: $*"; exit 1; } + +# Source the keymap helper. +# shellcheck source=auto-install-keymap.sh +. "$SCRIPT_DIR/auto-install-keymap.sh" + +# ── Preflight ────────────────────────────────────────────────────────── +preflight() { + log "preflight: checking environment" + + ISO="${1:-}" + if [[ -z $ISO ]]; then + echo "Usage: $0 " >&2 + exit 2 + fi + if [[ ! -f $ISO ]]; then + echo "[ERR] ISO not found: $ISO" >&2 + exit 2 + fi + + km_require_tools || exit 2 + for t in ssh ssh-keygen pgrep pkill; do + command -v "$t" >/dev/null 2>&1 || { echo "[ERR] missing $t" >&2; exit 2; } + done + + if [[ ! -f $OVMF_CODE ]]; then + echo "[ERR] OVMF firmware missing: $OVMF_CODE (install edk2-ovmf)" >&2 + exit 2 + fi + + log "preflight: ISO=$ISO" +} + +# ── VM lifecycle ─────────────────────────────────────────────────────── + +# Kill any QEMU we previously started + scrub state files. Idempotent. 
+kill_existing_vm() { + log "killing any existing auto-install QEMU" + if [[ -n "${QEMU_PID:-}" ]] && kill -0 "$QEMU_PID" 2>/dev/null; then + kill "$QEMU_PID" 2>/dev/null || true + sleep 2 + kill -9 "$QEMU_PID" 2>/dev/null || true + fi + # Catch orphans from prior runs — match by disk path so we don't kill + # the user's other QEMU VMs. + pkill -f "qemu-system-x86_64.*$DISK" 2>/dev/null || true + rm -f "$MONITOR_SOCK" "$SERIAL_LOG" +} + +# Wipe disk + nvram so each run is reproducible. +wipe_state() { + log "wiping qcow2 + nvram" + rm -f "$DISK" "$NVRAM" "$SEED_ISO" + qemu-img create -f qcow2 "$DISK" "$DISK_SIZE" >/dev/null + cp "$OVMF_VARS_SRC" "$NVRAM" +} + +# Build a NoCloud cloud-init seed ISO so anaconda's installed system picks +# up our SSH pubkey on first boot. The installer-generated ks doesn't +# explicitly invoke cloud-init, but Fedora ships cloud-init enabled by +# default in @core; if a cidata seed is present at boot, NoCloud datasource +# fires and we get key injection for free. +build_seed_iso() { + local pubkey="" found="" + for cand in "$HOME/.ssh/id_ed25519.pub" "$HOME/.ssh/id_rsa.pub"; do + if [[ -f $cand ]]; then + pubkey="$(< "$cand")" + found=$cand + break + fi + done + if [[ -z $pubkey ]]; then + log "seed: no host SSH pubkey found at ~/.ssh/id_{ed25519,rsa}.pub" + log "seed: generating throwaway test key" + local key=$TEST_DIR/auto-install-id_ed25519 + rm -f "$key" "$key.pub" + ssh-keygen -t ed25519 -N '' -f "$key" -C "veilor-auto-install" >/dev/null + pubkey="$(< "$key.pub")" + TEST_KEY="$key" + else + log "seed: using $found" + # Match host id; assume corresponding private key exists alongside. 
+ TEST_KEY="${found%.pub}" + fi + + local d + d=$(mktemp -d) + cat > "$d/meta-data" < "$d/user-data" </dev/null 2>&1; then + mkisofs -quiet -output "$SEED_ISO" -volid cidata -joliet -rock \ + "$d/user-data" "$d/meta-data" + elif command -v xorriso >/dev/null 2>&1; then + xorriso -as mkisofs -quiet -output "$SEED_ISO" -volid cidata \ + -joliet -rock "$d/user-data" "$d/meta-data" + else + log "seed: no mkisofs/xorriso — SSH key injection unavailable" + SEED_ISO="" + fi + rm -rf "$d" + [[ -f $SEED_ISO ]] && log "seed: built $SEED_ISO" +} + +# Launch QEMU in the background. Returns once the monitor socket is alive. +launch_vm() { + local iso=$1 + log "launching QEMU" + + local seed_args=() + [[ -n $SEED_ISO && -f $SEED_ISO ]] && \ + seed_args=(-drive "file=$SEED_ISO,media=cdrom,readonly=on") + + qemu-system-x86_64 \ + -name veilor-auto-install \ + -enable-kvm \ + -cpu host \ + -smp 4 \ + -m 4096 \ + -machine q35,smm=on \ + -global driver=cfi.pflash01,property=secure,value=on \ + -drive if=pflash,format=raw,readonly=on,file="$OVMF_CODE" \ + -drive if=pflash,format=raw,file="$NVRAM" \ + -drive file="$DISK",if=virtio,format=qcow2,cache=writeback \ + -drive file="$iso",media=cdrom,readonly=on \ + "${seed_args[@]}" \ + -monitor "unix:$MONITOR_SOCK,server,nowait" \ + -boot order=dc,menu=off \ + -netdev user,id=net0,hostfwd=tcp::${SSH_PORT}-:22 \ + -device virtio-net-pci,netdev=net0 \ + -device virtio-rng-pci \ + -vga virtio \ + -display none \ + -serial "file:$SERIAL_LOG" \ + >>"$LOG" 2>&1 & + QEMU_PID=$! + log "QEMU pid=$QEMU_PID" + + km_wait_socket "$MONITOR_SOCK" "$WAIT_MONITOR_S" \ + || fail "monitor socket never opened" + log "monitor socket ready" +} + +# Did QEMU die? Used at every poll; lets us bail with a useful message +# instead of waiting out the full timeout. +qemu_alive() { + [[ -n "${QEMU_PID:-}" ]] && kill -0 "$QEMU_PID" 2>/dev/null +} + +# ── Driver: walk the installer flow ──────────────────────────────────── + +# Take a numbered screenshot. 
Auto-increments NN. +SHOT_N=0 +shot() { + local label=$1 + SHOT_N=$((SHOT_N + 1)) + local file + file=$(printf '%s-%02d-%s.png' "$SHOT_PREFIX" "$SHOT_N" "$label") + km_screendump "$MONITOR_SOCK" "$file" + log "screenshot: $file" +} + +drive_installer() { + log "waiting ${WAIT_INSTALLER_BANNER_S}s for ISO boot + tty1 installer" + + # The live ISO autologs into multi-user.target, runs gum on tty1 via a + # systemd unit that replaces getty (see overlay/etc/systemd/system/ + # veilor-installer.service if it exists; otherwise via the multi-user + # default in kickstart line 250). + sleep "$WAIT_INSTALLER_BANNER_S" + qemu_alive || fail "QEMU died during ISO boot" + shot "boot-banner" + + # Make absolutely sure we're on tty1 (the live ks sets multi-user.target + # default, so we should already be there — but a stray graphical.target + # on dev builds would silently swallow our keystrokes). + km_send_chord "$MONITOR_SOCK" ctrl alt f1 + sleep "$WAIT_AFTER_INPUT_S" + shot "tty1" + + # Step 1: top option = "Install" — gum choose has it pre-selected. + log "step: select Install" + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_GUM_PROMPT_S" + shot "after-install-pick" + + # Step 2: disk select — only /dev/vda exists in this QEMU. Default + # selection = first row. + log "step: select disk (/dev/vda — only one)" + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_GUM_PROMPT_S" + shot "after-disk-pick" + + # Step 3: LUKS passphrase. gum input --password reads stdin until newline. + log "step: enter LUKS passphrase" + km_send_str "$MONITOR_SOCK" "$LUKS_PW" + sleep 1 + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_AFTER_INPUT_S" + shot "after-luks-pw" + + # Step 4: admin password. + log "step: enter admin password" + km_send_str "$MONITOR_SOCK" "$ADMIN_PW" + sleep 1 + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_AFTER_INPUT_S" + shot "after-admin-pw" + + # Step 5: locale select — first option = en_GB.UTF-8. 
+ log "step: confirm locale (en_GB.UTF-8)" + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_GUM_PROMPT_S" + shot "after-locale" + + # Step 6: confirm screen. gum confirm defaults to "Yes" focused → + # Enter accepts. (Verified against gum 0.13+ docs; if defaults change + # in a future gum, swap to explicit "y" via key map.) + log "step: confirm install" + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_AFTER_INPUT_S" + shot "after-confirm" + + log "installer driven: anaconda should now be running" +} + +# Quick non-blocking SSH probe. Returns 0 if reachable. +ssh_alive() { + ssh -p $SSH_PORT \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -o ConnectTimeout=3 \ + -o BatchMode=yes \ + ${TEST_KEY:+-i "$TEST_KEY"} \ + "$SSH_USER@127.0.0.1" true 2>/dev/null +} + +# Poll for anaconda completion + SSH availability. We can't watch QEMU exit +# (anaconda's `reboot` directive triggers systemctl reboot, which doesn't +# poweroff the VM — it boots back into the installed disk). The signal we +# actually trust is SSH on port 2222 starting to answer. +# +# If cloud-init didn't run (the seed ISO might not have been picked up by +# anaconda's installed system, depending on whether /etc/cloud is in the +# installed package set), SSH will never come up via key auth. The fallback +# in tty1_unlock_ssh() drives the SDDM/console login by hand. +wait_for_install_and_reboot() { + log "waiting up to ${ANACONDA_TIMEOUT_S}s for anaconda + reboot + SSH" + + local waited=0 last_shot=0 last_ppm_hash="" same_count=0 + while (( waited < ANACONDA_TIMEOUT_S )); do + if ! qemu_alive; then + fail "QEMU exited unexpectedly during install (check $SERIAL_LOG)" + fi + + # SSH probe — first PASS exits the loop. + if ssh_alive; then + log "SSH up — installed system reachable" + shot "ssh-up" + return 0 + fi + + # Periodic screenshot + stuck-screen detection. 
+ if (( waited - last_shot >= ANACONDA_POLL_S )); then + local ppm="$SHOT_PREFIX-poll.ppm" + km_monitor_send "$MONITOR_SOCK" "screendump $ppm" + sleep 1 + if [[ -f $ppm ]]; then + local h + h=$(sha256sum "$ppm" 2>/dev/null | cut -d' ' -f1) + if [[ -n $last_ppm_hash && $h == "$last_ppm_hash" ]]; then + same_count=$((same_count + 1)) + else + same_count=0 + fi + last_ppm_hash=$h + rm -f "$ppm" + fi + # 5 minutes of identical frames = stuck. Anaconda's text-mode + # progress refreshes at least every minute, so 10 frames in a + # row (5 min @ 30s cadence) identical means it's wedged. + if (( same_count >= 10 )); then + shot "stuck" + fail "screen unchanged for 5min — anaconda likely crashed" + fi + last_shot=$waited + log "anaconda still running... (${waited}s elapsed)" + fi + + sleep 5 + waited=$((waited + 5)) + done + + shot "ssh-timeout" + log "SSH never came up via cloud-init; trying TTY1 fallback" + if tty1_unlock_ssh; then + log "TTY1 fallback succeeded; SSH should be reachable" + return 0 + fi + fail "anaconda did not complete + SSH within ${ANACONDA_TIMEOUT_S}s, TTY1 fallback also failed" +} + +# TTY1 fallback: the installed system reached SDDM (graphical) or got stuck +# at LUKS prompt. We drop to a TTY, log in as admin (chage forces password +# change on first use), and undo the sshd hardening so our pubkey works. +# +# This is best-effort. If the LUKS prompt is still up — we can't get past +# it without typing the passphrase, which we do here too. +tty1_unlock_ssh() { + log "TTY1 fallback: typing LUKS passphrase + admin login + opening sshd" + + # Switch to tty1 in case SDDM grabbed graphical. + km_send_chord "$MONITOR_SOCK" ctrl alt f3 + sleep 3 + + # If we're at LUKS prompt, the passphrase clears it. If we're already + # past LUKS, this is a harmless garbage on the login prompt — we Enter + # to clear, then proceed with login. 
+ km_send_str "$MONITOR_SOCK" "$LUKS_PW" + km_send_key "$MONITOR_SOCK" ret + sleep 30 # cryptsetup unlock + boot to login prompt + + shot "tty3-prelogin" + + # Username — admin. chage -d 0 means we'll be prompted to change pw on + # first login. The old password is whatever we typed at install time; + # the new password just has to satisfy PAM minlen — reuse $ADMIN_PW + # and add a "1" suffix to make passwd's "must differ" check happy. + km_send_line "$MONITOR_SOCK" "admin" + sleep 3 + km_send_line "$MONITOR_SOCK" "$ADMIN_PW" + sleep 5 + # Old pw prompt (chage forced). + km_send_line "$MONITOR_SOCK" "$ADMIN_PW" + sleep 2 + # New pw twice. Use a derivative; PAM rejects identical-to-old and we + # don't want to surprise the user with a password change. + km_send_line "$MONITOR_SOCK" "${ADMIN_PW}new" + sleep 1 + km_send_line "$MONITOR_SOCK" "${ADMIN_PW}new" + sleep 5 + + shot "tty3-loggedin" + + # Inject host pubkey + remove sshd hardening + reload sshd. + local pubkey="" + if [[ -n "${TEST_KEY:-}" && -f "${TEST_KEY}.pub" ]]; then + pubkey=$(< "${TEST_KEY}.pub") + fi + if [[ -z $pubkey ]]; then + log "TTY1 fallback: no pubkey to inject — cannot recover SSH" + return 1 + fi + + km_send_line "$MONITOR_SOCK" "mkdir -p ~/.ssh && chmod 700 ~/.ssh" + sleep 1 + km_send_line "$MONITOR_SOCK" "echo '$pubkey' >> ~/.ssh/authorized_keys" + sleep 1 + km_send_line "$MONITOR_SOCK" "chmod 600 ~/.ssh/authorized_keys" + sleep 1 + km_send_line "$MONITOR_SOCK" "echo '${ADMIN_PW}new' | sudo -S rm -f /etc/ssh/sshd_config.d/10-veilor-hardening.conf" + sleep 2 + km_send_line "$MONITOR_SOCK" "echo '${ADMIN_PW}new' | sudo -S systemctl reload sshd" + sleep 5 + + # Wait up to 60s for SSH to actually answer. + local i + for ((i=0; i<60; i++)); do + if ssh_alive; then + log "TTY1 fallback: SSH reachable after ${i}s" + return 0 + fi + sleep 1 + done + return 1 +} + +# ── Validation ───────────────────────────────────────────────────────── +# Run a single SSH command, return its stdout. 
Failures are NOT fatal here +# — caller decides what's a hard failure. +remote() { + ssh -p $SSH_PORT \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -o BatchMode=yes \ + ${TEST_KEY:+-i "$TEST_KEY"} \ + "$SSH_USER@127.0.0.1" "$@" +} + +# Validation result accumulator. check_remote runs a shell snippet on the +# installed VM via SSH; the snippet must exit 0 for PASS, non-zero for +# FAIL. check_eq compares remote stdout to an expected literal. +VALIDATIONS=() + +# check_remote +# Runs the snippet via SSH, treats exit code as the verdict. +check_remote() { + local desc=$1 cmd=$2 + local out rc + out=$(remote "$cmd" 2>&1) + rc=$? + if (( rc == 0 )); then + VALIDATIONS+=("PASS $desc") + log " PASS: $desc" + else + # Truncate the failure context so the report stays scannable. + local trimmed=${out:0:120} + VALIDATIONS+=("FAIL $desc ($trimmed)") + log " FAIL: $desc -- $trimmed" + fi +} + +# check_eq +# Runs the snippet, trims trailing whitespace, compares to expected. +check_eq() { + local desc=$1 cmd=$2 expected=$3 + local got + got=$(remote "$cmd" 2>/dev/null | tr -d '\r' | tail -n1) + got=${got%[[:space:]]} + if [[ $got == "$expected" ]]; then + VALIDATIONS+=("PASS $desc (=$got)") + log " PASS: $desc (=$got)" + else + VALIDATIONS+=("FAIL $desc (got: '$got', expected: '$expected')") + log " FAIL: $desc -- got '$got' expected '$expected'" + fi +} + +run_validation() { + log "running validation checklist" + + # os-release + check_remote "/etc/os-release: NAME=veilor-os" \ + 'grep -q "^NAME=.veilor-os" /etc/os-release' + + check_eq "hostnamectl --static = veilor" \ + 'hostnamectl --static' "veilor" + + # Active services + for svc in sshd fail2ban usbguard tuned auditd firewalld chronyd sddm; do + check_eq "$svc is-active" \ + "systemctl is-active $svc" "active" + done + + # SELinux. 
v0.5.x kickstart sets `selinux --enforcing` for installed + # systems but veilor-firstboot may toggle behavior — accept either + # Enforcing or Permissive, but log which one we got. (Hard-fail on + # Disabled.) + local selinux + selinux=$(remote getenforce 2>/dev/null | tr -d '\r' | tail -n1) + selinux=${selinux%[[:space:]]} + if [[ $selinux == Enforcing ]]; then + VALIDATIONS+=("PASS SELinux = Enforcing") + log " PASS: SELinux = Enforcing" + elif [[ $selinux == Permissive ]]; then + VALIDATIONS+=("PASS SELinux = Permissive (acceptable for v0.5)") + log " PASS (soft): SELinux = Permissive" + else + VALIDATIONS+=("FAIL SELinux = $selinux") + log " FAIL: SELinux = $selinux" + fi + + # Disk layout: LUKS2 + btrfs. + check_remote "lsblk shows crypto_LUKS" \ + 'lsblk -f | grep -q crypto_LUKS' + check_remote "lsblk shows btrfs" \ + 'lsblk -f | grep -q btrfs' + check_remote "/etc/crypttab has LUKS entry" \ + 'grep -Ev "^\s*(#|$)" /etc/crypttab | grep -qi luks' + + # Admin user + check_remote "admin user exists" \ + 'getent passwd admin | grep -q "^admin:"' + + # CLI tools shipped via overlay. + for bin in veilor-power veilor-doctor veilor-update; do + check_remote "/usr/local/bin/$bin present" \ + "test -x /usr/local/bin/$bin" + done + + # init_on_alloc — veilor-installer kickstart sets it on the install + # cmdline (line 315). /proc/cmdline is the source of truth. 
+ check_remote "init_on_alloc=1 in /proc/cmdline" \ + 'grep -q init_on_alloc=1 /proc/cmdline' +} + +# ── Reporting ────────────────────────────────────────────────────────── +print_report() { + local pass=0 fail=0 + for line in "${VALIDATIONS[@]}"; do + case "$line" in + PASS*) pass=$((pass + 1)) ;; + FAIL*) fail=$((fail + 1)) ;; + esac + done + + { + echo "════════════════════════════════════════════════════════" + echo " veilor-os auto-install test report" + echo " $(date)" + echo "════════════════════════════════════════════════════════" + printf '%s\n' "${VALIDATIONS[@]}" + echo "────────────────────────────────────────────────────────" + printf 'TOTAL: %d PASS, %d FAIL\n' "$pass" "$fail" + echo "Logs: $LOG" + echo "Screenshots: ${SHOT_PREFIX}-NN-*.png" + echo "Serial log: $SERIAL_LOG" + echo "════════════════════════════════════════════════════════" + } | tee -a "$LOG" + + # Capture a final SSH session snapshot (uname/lsblk/sysctl) for the + # human reviewer. + { + echo "=== final ssh probe ===" + date + echo "--- uname -a ---" + remote uname -a 2>&1 + echo "--- lsblk -f ---" + remote lsblk -f 2>&1 + echo "--- /proc/cmdline ---" + remote cat /proc/cmdline 2>&1 + echo "--- systemctl --failed ---" + remote systemctl --failed 2>&1 + } > "${SHOT_PREFIX}-final-ssh.txt" 2>&1 || true + log "final ssh snapshot: ${SHOT_PREFIX}-final-ssh.txt" + + if (( fail > 0 )); then + return 1 + fi + return 0 +} + +cleanup() { + log "cleanup" + if [[ -n "${QEMU_PID:-}" ]] && kill -0 "$QEMU_PID" 2>/dev/null; then + # Graceful shutdown via monitor first; SIGTERM if it ignores us. 
+ km_monitor_send "$MONITOR_SOCK" "system_powerdown" 2>/dev/null || true + sleep 5 + if kill -0 "$QEMU_PID" 2>/dev/null; then + kill "$QEMU_PID" 2>/dev/null || true + sleep 2 + kill -9 "$QEMU_PID" 2>/dev/null || true + fi + fi + rm -f "$MONITOR_SOCK" +} + +# ── Main ─────────────────────────────────────────────────────────────── +main() { + trap cleanup EXIT + + preflight "$@" + kill_existing_vm + wipe_state + build_seed_iso + launch_vm "$ISO" + drive_installer + wait_for_install_and_reboot + run_validation + print_report +} + +main "$@"