From be761d57fcd6ba2af0ebb5c4e90a2278d3562177 Mon Sep 17 00:00:00 2001 From: veilor-org Date: Sat, 2 May 2026 21:24:59 +0100 Subject: [PATCH] v0.5.5: autonomous install test harness test/auto-install.sh boots ISO, drives gum installer via QEMU monitor sendkey with hardcoded test answers, waits for anaconda, reboots into installed system, SSHs in, runs validation checklist. --- test/README.md | 66 ++++ test/auto-install-keymap.sh | 167 +++++++++ test/auto-install.sh | 654 ++++++++++++++++++++++++++++++++++++ 3 files changed, 887 insertions(+) create mode 100644 test/README.md create mode 100755 test/auto-install-keymap.sh create mode 100755 test/auto-install.sh diff --git a/test/README.md b/test/README.md new file mode 100644 index 0000000..e322dc5 --- /dev/null +++ b/test/README.md @@ -0,0 +1,66 @@ +# test/ + +Test harnesses for veilor-os ISO builds. + +## Files + +| File | Purpose | +|------|---------| +| `run-vm.sh` | Manual smoke test — boot the latest ISO interactively in QEMU/KVM. SSH key injection via cloud-init seed + monitor sendkey fallback for live-image login. | +| `auto-install.sh` | **Autonomous** end-to-end install test. Boots ISO, drives the gum installer via QEMU monitor `sendkey`, waits for anaconda to finish + reboot, SSHs into the installed system, runs validation checklist. Prints PASS/FAIL summary. | +| `auto-install-keymap.sh` | Sourced helper. Provides `km_send_str`, `km_send_chord`, `km_send_key`, `km_screendump`, `km_wait_socket`, etc. Reusable by other automation. | +| `boot-checklist.md` | Manual post-install checklist (run on a real spare laptop). 
|

## Running the autonomous installer test

```sh
./test/auto-install.sh build/out/veilor-os-*.iso
```

Hardcoded inputs (deterministic — do not edit during a test run):
- Disk: first `/dev/vda` (the only disk in QEMU)
- Hostname: `veilor` (installer hardcoded since v0.5.4)
- LUKS passphrase: `testpass1234`
- Admin password: `adminpass1234`
- Locale: `en_GB.UTF-8`

Expected runtime: 20–30 minutes wall clock (anaconda dominates).

### Outputs

- `/tmp/veilor-auto-install.log` — full driver log
- `/tmp/veilor-auto-install-NN-<label>.png` — milestone screenshots
- `/tmp/veilor-auto-install-final-ssh.txt` — final SSH session capture (uname/lsblk/cmdline/failed units)

### Exit codes

- `0` — all validation checks passed
- `1` — any failure (anaconda crashed, SSH never came up, validation check failed)
- `2` — preflight failure (missing tool, bad ISO arg, missing OVMF)

### Prerequisites

- `qemu-system-x86_64`, `qemu-img`, `socat`, `ssh`, `ssh-keygen`
- `edk2-ovmf` (OVMF UEFI firmware at `/usr/share/edk2/ovmf/OVMF_{CODE,VARS}.fd`)
- `mkisofs` or `xorriso` (for cloud-init seed ISO; harness falls back to TTY1 driving if seed cannot be built or cloud-init does not run on the installed system)
- `convert` from ImageMagick (optional — converts PPM screendumps to PNG; harness keeps PPM if absent)
- KVM access (`/dev/kvm` readable by the user)

### What it validates

Post-install on the booted system:
- `/etc/os-release` → `NAME=veilor-os`
- `hostnamectl --static` → `veilor`
- `systemctl is-active` → `active` for `sshd fail2ban usbguard tuned auditd firewalld chronyd sddm`
- `getenforce` → `Enforcing` (preferred) or `Permissive` (acceptable for v0.5.x)
- `lsblk -f` shows `crypto_LUKS` + `btrfs`
- `/etc/crypttab` has a LUKS entry
- `getent passwd admin` returns the user
- `/usr/local/bin/{veilor-power,veilor-doctor,veilor-update}` are present and executable
- `/proc/cmdline` contains `init_on_alloc=1`

### Troubleshooting

- 
**Stuck at boot banner**: ISO didn't autostart `veilor-installer` on tty1. Check `serial.log` and `auto-install-vm-NN-*.png` screenshots. The harness aborts after 5 minutes of identical screen frames. +- **SSH never up**: cloud-init may not have run on the installed system (no `cidata` mount). The harness falls back to TTY1 driving — typing the LUKS passphrase, logging in as admin, and hand-injecting the SSH key. If both paths fail, validation cannot proceed. +- **`screendump` produces unreadable PPM**: install ImageMagick (`dnf install ImageMagick`) so the harness converts to PNG. diff --git a/test/auto-install-keymap.sh b/test/auto-install-keymap.sh new file mode 100755 index 0000000..56fdebd --- /dev/null +++ b/test/auto-install-keymap.sh @@ -0,0 +1,167 @@ +#!/usr/bin/env bash +# auto-install-keymap.sh — sourced helper for QEMU-monitor-driven UI automation. +# +# Provides a minimal but complete US-layout keymap mapping every printable +# ASCII character to a QEMU `sendkey` chord, plus convenience wrappers for +# typing strings, sending special keys, taking screenshots, and waiting for +# the monitor socket to appear. +# +# Usage: +# source test/auto-install-keymap.sh +# MONITOR_SOCK=/path/to/sock +# km_wait_socket "$MONITOR_SOCK" 60 +# km_send_str "$MONITOR_SOCK" "hello world" +# km_send_key "$MONITOR_SOCK" ret +# km_send_chord "$MONITOR_SOCK" ctrl alt f1 +# km_screendump "$MONITOR_SOCK" /tmp/shot.ppm +# +# Why a separate file: other harnesses (regression suites, fuzzers) can +# source this without dragging in the full installer test driver. + +# Guard against double-source. +[[ -n "${__VEILOR_KEYMAP_LOADED:-}" ]] && return 0 +__VEILOR_KEYMAP_LOADED=1 + +# ── Tool requirements ────────────────────────────────────────────────── +# socat is the canonical way to talk to a unix-domain QEMU monitor. +# nc-openbsd would also work but socat is what run-vm.sh already uses. 
+km_require_tools() { + local missing=() + for t in socat qemu-img qemu-system-x86_64; do + command -v "$t" >/dev/null 2>&1 || missing+=("$t") + done + if [[ ${#missing[@]} -gt 0 ]]; then + echo "[ERR] missing required tools: ${missing[*]}" >&2 + return 1 + fi +} + +# ── Low-level monitor I/O ────────────────────────────────────────────── +# Send a single line of monitor input. Newlines are critical — QEMU's HMP +# parses one command per line. Errors are swallowed: the most common cause +# is the VM having shut down between two send_* calls, which we tolerate. +km_monitor_send() { + local sock=$1; shift + printf '%s\n' "$*" | socat - "UNIX-CONNECT:$sock" 2>/dev/null || true +} + +# Send a raw HMP command and capture any stdout response (e.g. for `info` +# queries). Trims the QEMU monitor banner + prompt noise. +km_monitor_query() { + local sock=$1; shift + printf '%s\n' "$*" | socat -t 1 - "UNIX-CONNECT:$sock" 2>/dev/null \ + | sed -e 's/\r//g' -e '/^QEMU /d' -e '/^(qemu)/d' || true +} + +# Wait until the monitor unix socket exists and accepts connections. +# $2 = max wait in seconds (default 60). +km_wait_socket() { + local sock=$1 max=${2:-60} waited=0 + while (( waited < max )); do + if [[ -S $sock ]]; then + # Try a no-op query — confirms the QEMU side is actually serving. + if printf 'info status\n' | socat -t 1 - "UNIX-CONNECT:$sock" >/dev/null 2>&1; then + return 0 + fi + fi + sleep 1 + waited=$((waited + 1)) + done + echo "[ERR] monitor socket $sock never became ready (waited ${max}s)" >&2 + return 1 +} + +# ── Screenshots ──────────────────────────────────────────────────────── +# Ask QEMU to dump the current framebuffer. Output is PPM. Convert to PNG +# with ImageMagick if available; otherwise leave PPM and warn. 
+km_screendump() { + local sock=$1 out=$2 + local ppm="${out%.png}.ppm" + km_monitor_send "$sock" "screendump $ppm" + sleep 1 # give QEMU a moment to flush + if [[ -f $ppm ]] && command -v convert >/dev/null 2>&1; then + convert "$ppm" "$out" 2>/dev/null && rm -f "$ppm" + fi +} + +# ── Key tables ───────────────────────────────────────────────────────── +# QEMU `sendkey` reference: docs/system/keys.html.in. The HMP names are +# the X11 keysym lower-case, with a few exceptions for non-letter keys +# (spc, ret, minus, etc.). What follows is the full US-layout printable +# ASCII set. Everything outside this table is silently dropped — callers +# are responsible for not feeding it characters the installer can't accept +# anyway (passwords are validated to ASCII-printable in veilor-installer). +declare -gA __KM_PLAIN=( + [' ']=spc [a]=a [b]=b [c]=c [d]=d [e]=e [f]=f [g]=g [h]=h + [i]=i [j]=j [k]=k [l]=l [m]=m [n]=n [o]=o [p]=p [q]=q [r]=r + [s]=s [t]=t [u]=u [v]=v [w]=w [x]=x [y]=y [z]=z + [0]=0 [1]=1 [2]=2 [3]=3 [4]=4 [5]=5 [6]=6 [7]=7 [8]=8 [9]=9 + ['-']=minus ['=']=equal ['[']=bracket_left [']']=bracket_right + [';']=semicolon ["'"]=apostrophe [',']=comma ['.']=dot + ['/']=slash ['\\']=backslash ['`']=grave_accent +) + +# Shift-prefixed (US): all caps + shifted-symbol row. +declare -gA __KM_SHIFT=( + [A]=a [B]=b [C]=c [D]=d [E]=e [F]=f [G]=g [H]=h [I]=i [J]=j + [K]=k [L]=l [M]=m [N]=n [O]=o [P]=p [Q]=q [R]=r [S]=s [T]=t + [U]=u [V]=v [W]=w [X]=x [Y]=y [Z]=z + ['!']=1 ['@']=2 ['#']=3 ['$']=4 ['%']=5 + ['^']=6 ['&']=7 ['*']=8 ['(']=9 [')']=0 + ['_']=minus ['+']=equal ['{']=bracket_left ['}']=bracket_right + [':']=semicolon ['"']=apostrophe ['<']=comma ['>']=dot + ['?']=slash ['|']=backslash ['~']=grave_accent +) + +# ── Public send wrappers ─────────────────────────────────────────────── +# Send a single named key (e.g. ret, esc, up, tab, f1). 
+km_send_key() { + local sock=$1 key=$2 + km_monitor_send "$sock" "sendkey $key" +} + +# Send a chord — components are joined with `-` per QEMU HMP syntax. +km_send_chord() { + local sock=$1; shift + local IFS='-' + km_monitor_send "$sock" "sendkey $*" +} + +# Type a string by encoding each character via the keymap. Unrecognised +# characters are skipped with a warning to stderr — caller is expected to +# stick to printable ASCII. +km_send_str() { + local sock=$1 s=$2 ch chord + local i=0 + while (( i < ${#s} )); do + ch="${s:i:1}" + if [[ -n "${__KM_PLAIN[$ch]:-}" ]]; then + chord="${__KM_PLAIN[$ch]}" + km_monitor_send "$sock" "sendkey $chord" + elif [[ -n "${__KM_SHIFT[$ch]:-}" ]]; then + chord="${__KM_SHIFT[$ch]}" + km_monitor_send "$sock" "sendkey shift-$chord" + else + printf '[WARN] km_send_str: unencodable char %q skipped\n' "$ch" >&2 + fi + i=$((i + 1)) + # Tiny gap so QEMU doesn't drop fast keypresses on busy hosts. + # Empirically 5ms = the line between "100% reliable" and "loses ~1%". + sleep 0.02 + done +} + +# Convenience: type a string then press Enter. +km_send_line() { + local sock=$1 s=$2 + km_send_str "$sock" "$s" + km_send_key "$sock" ret +} + +# Visual indicator for log readability — prints a banner + a short pause so +# the next monitor command has time to land on a stable UI frame. Used by +# the harness between major steps; safe to skip in automated reuse. +km_step_banner() { + local label=$1 + printf '\n──── %s @ %s ────\n' "$label" "$(date +'%H:%M:%S')" +} diff --git a/test/auto-install.sh b/test/auto-install.sh new file mode 100755 index 0000000..b77ecb2 --- /dev/null +++ b/test/auto-install.sh @@ -0,0 +1,654 @@ +#!/usr/bin/env bash +# auto-install.sh — autonomous end-to-end install test for veilor-os. +# +# Boots a fresh ISO under QEMU, drives the gum installer via the QEMU +# monitor (sendkey events), waits for anaconda to finish + reboot, SSHes +# into the installed system, and runs a validation checklist. 
+# +# Usage: +# ./test/auto-install.sh path/to/veilor-os-*.iso +# +# Expected runtime: +# * boot + drive installer: ~3 min +# * anaconda install (KDE): ~15-25 min (depends on mirrors + host CPU) +# * reboot + SSH up: ~2 min +# * validation checks: <1 min +# * total: 20-30 min wall clock +# +# Hardcoded test inputs (do NOT edit — meant to be deterministic): +# disk first /dev/vda (only disk in QEMU) +# hostname "veilor" (installer hardcodes this in v0.5.4) +# LUKS pw testpass1234 +# admin pw adminpass1234 +# locale en_GB.UTF-8 (first option, accepted with Enter) +# +# Outputs: +# /tmp/veilor-auto-install.log — full driver log +# /tmp/veilor-auto-install-NN-.png — milestone screenshots +# /tmp/veilor-auto-install-final-ssh.txt — final SSH session capture +# +# Exit codes: +# 0 = all validation checks passed +# 1 = any failure (anaconda crash, SSH never up, validation failed) +# 2 = preflight failure (missing tool, bad ISO arg) +# +# This script intentionally does not source test/run-vm.sh — it needs a +# different QEMU configuration (no live cloud-init seed since we're driving +# the installed-system path), and run-vm.sh `exec`s qemu, which is +# incompatible with running QEMU as a backgrounded child here. + +set -uo pipefail + +# ── Constants ────────────────────────────────────────────────────────── +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +TEST_DIR="$SCRIPT_DIR" +DISK="$TEST_DIR/auto-install-vm.qcow2" +NVRAM="$TEST_DIR/auto-install-vm.nvram" +MONITOR_SOCK="$TEST_DIR/auto-install-vm.monitor.sock" +SERIAL_LOG="$TEST_DIR/auto-install-vm.serial.log" +SEED_ISO="$TEST_DIR/auto-install-seed.iso" + +LOG=/tmp/veilor-auto-install.log +SHOT_PREFIX=/tmp/veilor-auto-install +SSH_PORT=2222 +SSH_USER=admin + +LUKS_PW="testpass1234" +ADMIN_PW="adminpass1234" + +# Disk: 40G is enough headroom — KDE base + 8G LUKS + LVM overhead fits in +# ~12G actual, but qcow2 only allocates blocks that get touched. +DISK_SIZE=40G + +# OVMF firmware paths — Fedora layout. 
Caller can override if needed. +OVMF_CODE="${OVMF_CODE:-/usr/share/edk2/ovmf/OVMF_CODE.fd}" +OVMF_VARS_SRC="${OVMF_VARS_SRC:-/usr/share/edk2/ovmf/OVMF_VARS.fd}" + +# Timing knobs — coarse but deliberate. Tighten only after observing slack +# on a real run. +WAIT_MONITOR_S=120 # qemu boot to monitor socket alive +WAIT_INSTALLER_BANNER_S=180 # ISO boot → tty1 gum menu visible +WAIT_GUM_PROMPT_S=8 # gum draws each prompt within ~5s +WAIT_AFTER_INPUT_S=3 # let UI advance after we hit Enter +ANACONDA_TIMEOUT_S=2700 # 45 min — anaconda + reboot + SSH come-up +ANACONDA_POLL_S=30 # screenshot/poll cadence during install + +# ── Logging ──────────────────────────────────────────────────────────── +: > "$LOG" +log() { printf '[%s] %s\n' "$(date +'%H:%M:%S')" "$*" | tee -a "$LOG"; } +fail() { log "FAIL: $*"; exit 1; } + +# Source the keymap helper. +# shellcheck source=auto-install-keymap.sh +. "$SCRIPT_DIR/auto-install-keymap.sh" + +# ── Preflight ────────────────────────────────────────────────────────── +preflight() { + log "preflight: checking environment" + + ISO="${1:-}" + if [[ -z $ISO ]]; then + echo "Usage: $0 " >&2 + exit 2 + fi + if [[ ! -f $ISO ]]; then + echo "[ERR] ISO not found: $ISO" >&2 + exit 2 + fi + + km_require_tools || exit 2 + for t in ssh ssh-keygen pgrep pkill; do + command -v "$t" >/dev/null 2>&1 || { echo "[ERR] missing $t" >&2; exit 2; } + done + + if [[ ! -f $OVMF_CODE ]]; then + echo "[ERR] OVMF firmware missing: $OVMF_CODE (install edk2-ovmf)" >&2 + exit 2 + fi + + log "preflight: ISO=$ISO" +} + +# ── VM lifecycle ─────────────────────────────────────────────────────── + +# Kill any QEMU we previously started + scrub state files. Idempotent. 
+kill_existing_vm() { + log "killing any existing auto-install QEMU" + if [[ -n "${QEMU_PID:-}" ]] && kill -0 "$QEMU_PID" 2>/dev/null; then + kill "$QEMU_PID" 2>/dev/null || true + sleep 2 + kill -9 "$QEMU_PID" 2>/dev/null || true + fi + # Catch orphans from prior runs — match by disk path so we don't kill + # the user's other QEMU VMs. + pkill -f "qemu-system-x86_64.*$DISK" 2>/dev/null || true + rm -f "$MONITOR_SOCK" "$SERIAL_LOG" +} + +# Wipe disk + nvram so each run is reproducible. +wipe_state() { + log "wiping qcow2 + nvram" + rm -f "$DISK" "$NVRAM" "$SEED_ISO" + qemu-img create -f qcow2 "$DISK" "$DISK_SIZE" >/dev/null + cp "$OVMF_VARS_SRC" "$NVRAM" +} + +# Build a NoCloud cloud-init seed ISO so anaconda's installed system picks +# up our SSH pubkey on first boot. The installer-generated ks doesn't +# explicitly invoke cloud-init, but Fedora ships cloud-init enabled by +# default in @core; if a cidata seed is present at boot, NoCloud datasource +# fires and we get key injection for free. +build_seed_iso() { + local pubkey="" found="" + for cand in "$HOME/.ssh/id_ed25519.pub" "$HOME/.ssh/id_rsa.pub"; do + if [[ -f $cand ]]; then + pubkey="$(< "$cand")" + found=$cand + break + fi + done + if [[ -z $pubkey ]]; then + log "seed: no host SSH pubkey found at ~/.ssh/id_{ed25519,rsa}.pub" + log "seed: generating throwaway test key" + local key=$TEST_DIR/auto-install-id_ed25519 + rm -f "$key" "$key.pub" + ssh-keygen -t ed25519 -N '' -f "$key" -C "veilor-auto-install" >/dev/null + pubkey="$(< "$key.pub")" + TEST_KEY="$key" + else + log "seed: using $found" + # Match host id; assume corresponding private key exists alongside. 
+ TEST_KEY="${found%.pub}" + fi + + local d + d=$(mktemp -d) + cat > "$d/meta-data" < "$d/user-data" </dev/null 2>&1; then + mkisofs -quiet -output "$SEED_ISO" -volid cidata -joliet -rock \ + "$d/user-data" "$d/meta-data" + elif command -v xorriso >/dev/null 2>&1; then + xorriso -as mkisofs -quiet -output "$SEED_ISO" -volid cidata \ + -joliet -rock "$d/user-data" "$d/meta-data" + else + log "seed: no mkisofs/xorriso — SSH key injection unavailable" + SEED_ISO="" + fi + rm -rf "$d" + [[ -f $SEED_ISO ]] && log "seed: built $SEED_ISO" +} + +# Launch QEMU in the background. Returns once the monitor socket is alive. +launch_vm() { + local iso=$1 + log "launching QEMU" + + local seed_args=() + [[ -n $SEED_ISO && -f $SEED_ISO ]] && \ + seed_args=(-drive "file=$SEED_ISO,media=cdrom,readonly=on") + + qemu-system-x86_64 \ + -name veilor-auto-install \ + -enable-kvm \ + -cpu host \ + -smp 4 \ + -m 4096 \ + -machine q35,smm=on \ + -global driver=cfi.pflash01,property=secure,value=on \ + -drive if=pflash,format=raw,readonly=on,file="$OVMF_CODE" \ + -drive if=pflash,format=raw,file="$NVRAM" \ + -drive file="$DISK",if=virtio,format=qcow2,cache=writeback \ + -drive file="$iso",media=cdrom,readonly=on \ + "${seed_args[@]}" \ + -monitor "unix:$MONITOR_SOCK,server,nowait" \ + -boot order=dc,menu=off \ + -netdev user,id=net0,hostfwd=tcp::${SSH_PORT}-:22 \ + -device virtio-net-pci,netdev=net0 \ + -device virtio-rng-pci \ + -vga virtio \ + -display none \ + -serial "file:$SERIAL_LOG" \ + >>"$LOG" 2>&1 & + QEMU_PID=$! + log "QEMU pid=$QEMU_PID" + + km_wait_socket "$MONITOR_SOCK" "$WAIT_MONITOR_S" \ + || fail "monitor socket never opened" + log "monitor socket ready" +} + +# Did QEMU die? Used at every poll; lets us bail with a useful message +# instead of waiting out the full timeout. +qemu_alive() { + [[ -n "${QEMU_PID:-}" ]] && kill -0 "$QEMU_PID" 2>/dev/null +} + +# ── Driver: walk the installer flow ──────────────────────────────────── + +# Take a numbered screenshot. 
Auto-increments NN. +SHOT_N=0 +shot() { + local label=$1 + SHOT_N=$((SHOT_N + 1)) + local file + file=$(printf '%s-%02d-%s.png' "$SHOT_PREFIX" "$SHOT_N" "$label") + km_screendump "$MONITOR_SOCK" "$file" + log "screenshot: $file" +} + +drive_installer() { + log "waiting ${WAIT_INSTALLER_BANNER_S}s for ISO boot + tty1 installer" + + # The live ISO autologs into multi-user.target, runs gum on tty1 via a + # systemd unit that replaces getty (see overlay/etc/systemd/system/ + # veilor-installer.service if it exists; otherwise via the multi-user + # default in kickstart line 250). + sleep "$WAIT_INSTALLER_BANNER_S" + qemu_alive || fail "QEMU died during ISO boot" + shot "boot-banner" + + # Make absolutely sure we're on tty1 (the live ks sets multi-user.target + # default, so we should already be there — but a stray graphical.target + # on dev builds would silently swallow our keystrokes). + km_send_chord "$MONITOR_SOCK" ctrl alt f1 + sleep "$WAIT_AFTER_INPUT_S" + shot "tty1" + + # Step 1: top option = "Install" — gum choose has it pre-selected. + log "step: select Install" + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_GUM_PROMPT_S" + shot "after-install-pick" + + # Step 2: disk select — only /dev/vda exists in this QEMU. Default + # selection = first row. + log "step: select disk (/dev/vda — only one)" + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_GUM_PROMPT_S" + shot "after-disk-pick" + + # Step 3: LUKS passphrase. gum input --password reads stdin until newline. + log "step: enter LUKS passphrase" + km_send_str "$MONITOR_SOCK" "$LUKS_PW" + sleep 1 + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_AFTER_INPUT_S" + shot "after-luks-pw" + + # Step 4: admin password. + log "step: enter admin password" + km_send_str "$MONITOR_SOCK" "$ADMIN_PW" + sleep 1 + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_AFTER_INPUT_S" + shot "after-admin-pw" + + # Step 5: locale select — first option = en_GB.UTF-8. 
+ log "step: confirm locale (en_GB.UTF-8)" + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_GUM_PROMPT_S" + shot "after-locale" + + # Step 6: confirm screen. gum confirm defaults to "Yes" focused → + # Enter accepts. (Verified against gum 0.13+ docs; if defaults change + # in a future gum, swap to explicit "y" via key map.) + log "step: confirm install" + km_send_key "$MONITOR_SOCK" ret + sleep "$WAIT_AFTER_INPUT_S" + shot "after-confirm" + + log "installer driven: anaconda should now be running" +} + +# Quick non-blocking SSH probe. Returns 0 if reachable. +ssh_alive() { + ssh -p $SSH_PORT \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -o ConnectTimeout=3 \ + -o BatchMode=yes \ + ${TEST_KEY:+-i "$TEST_KEY"} \ + "$SSH_USER@127.0.0.1" true 2>/dev/null +} + +# Poll for anaconda completion + SSH availability. We can't watch QEMU exit +# (anaconda's `reboot` directive triggers systemctl reboot, which doesn't +# poweroff the VM — it boots back into the installed disk). The signal we +# actually trust is SSH on port 2222 starting to answer. +# +# If cloud-init didn't run (the seed ISO might not have been picked up by +# anaconda's installed system, depending on whether /etc/cloud is in the +# installed package set), SSH will never come up via key auth. The fallback +# in tty1_unlock_ssh() drives the SDDM/console login by hand. +wait_for_install_and_reboot() { + log "waiting up to ${ANACONDA_TIMEOUT_S}s for anaconda + reboot + SSH" + + local waited=0 last_shot=0 last_ppm_hash="" same_count=0 + while (( waited < ANACONDA_TIMEOUT_S )); do + if ! qemu_alive; then + fail "QEMU exited unexpectedly during install (check $SERIAL_LOG)" + fi + + # SSH probe — first PASS exits the loop. + if ssh_alive; then + log "SSH up — installed system reachable" + shot "ssh-up" + return 0 + fi + + # Periodic screenshot + stuck-screen detection. 
+ if (( waited - last_shot >= ANACONDA_POLL_S )); then + local ppm="$SHOT_PREFIX-poll.ppm" + km_monitor_send "$MONITOR_SOCK" "screendump $ppm" + sleep 1 + if [[ -f $ppm ]]; then + local h + h=$(sha256sum "$ppm" 2>/dev/null | cut -d' ' -f1) + if [[ -n $last_ppm_hash && $h == "$last_ppm_hash" ]]; then + same_count=$((same_count + 1)) + else + same_count=0 + fi + last_ppm_hash=$h + rm -f "$ppm" + fi + # 5 minutes of identical frames = stuck. Anaconda's text-mode + # progress refreshes at least every minute, so 10 frames in a + # row (5 min @ 30s cadence) identical means it's wedged. + if (( same_count >= 10 )); then + shot "stuck" + fail "screen unchanged for 5min — anaconda likely crashed" + fi + last_shot=$waited + log "anaconda still running... (${waited}s elapsed)" + fi + + sleep 5 + waited=$((waited + 5)) + done + + shot "ssh-timeout" + log "SSH never came up via cloud-init; trying TTY1 fallback" + if tty1_unlock_ssh; then + log "TTY1 fallback succeeded; SSH should be reachable" + return 0 + fi + fail "anaconda did not complete + SSH within ${ANACONDA_TIMEOUT_S}s, TTY1 fallback also failed" +} + +# TTY1 fallback: the installed system reached SDDM (graphical) or got stuck +# at LUKS prompt. We drop to a TTY, log in as admin (chage forces password +# change on first use), and undo the sshd hardening so our pubkey works. +# +# This is best-effort. If the LUKS prompt is still up — we can't get past +# it without typing the passphrase, which we do here too. +tty1_unlock_ssh() { + log "TTY1 fallback: typing LUKS passphrase + admin login + opening sshd" + + # Switch to tty1 in case SDDM grabbed graphical. + km_send_chord "$MONITOR_SOCK" ctrl alt f3 + sleep 3 + + # If we're at LUKS prompt, the passphrase clears it. If we're already + # past LUKS, this is a harmless garbage on the login prompt — we Enter + # to clear, then proceed with login. 
+ km_send_str "$MONITOR_SOCK" "$LUKS_PW" + km_send_key "$MONITOR_SOCK" ret + sleep 30 # cryptsetup unlock + boot to login prompt + + shot "tty3-prelogin" + + # Username — admin. chage -d 0 means we'll be prompted to change pw on + # first login. The old password is whatever we typed at install time; + # the new password just has to satisfy PAM minlen — reuse $ADMIN_PW + # and add a "1" suffix to make passwd's "must differ" check happy. + km_send_line "$MONITOR_SOCK" "admin" + sleep 3 + km_send_line "$MONITOR_SOCK" "$ADMIN_PW" + sleep 5 + # Old pw prompt (chage forced). + km_send_line "$MONITOR_SOCK" "$ADMIN_PW" + sleep 2 + # New pw twice. Use a derivative; PAM rejects identical-to-old and we + # don't want to surprise the user with a password change. + km_send_line "$MONITOR_SOCK" "${ADMIN_PW}new" + sleep 1 + km_send_line "$MONITOR_SOCK" "${ADMIN_PW}new" + sleep 5 + + shot "tty3-loggedin" + + # Inject host pubkey + remove sshd hardening + reload sshd. + local pubkey="" + if [[ -n "${TEST_KEY:-}" && -f "${TEST_KEY}.pub" ]]; then + pubkey=$(< "${TEST_KEY}.pub") + fi + if [[ -z $pubkey ]]; then + log "TTY1 fallback: no pubkey to inject — cannot recover SSH" + return 1 + fi + + km_send_line "$MONITOR_SOCK" "mkdir -p ~/.ssh && chmod 700 ~/.ssh" + sleep 1 + km_send_line "$MONITOR_SOCK" "echo '$pubkey' >> ~/.ssh/authorized_keys" + sleep 1 + km_send_line "$MONITOR_SOCK" "chmod 600 ~/.ssh/authorized_keys" + sleep 1 + km_send_line "$MONITOR_SOCK" "echo '${ADMIN_PW}new' | sudo -S rm -f /etc/ssh/sshd_config.d/10-veilor-hardening.conf" + sleep 2 + km_send_line "$MONITOR_SOCK" "echo '${ADMIN_PW}new' | sudo -S systemctl reload sshd" + sleep 5 + + # Wait up to 60s for SSH to actually answer. + local i + for ((i=0; i<60; i++)); do + if ssh_alive; then + log "TTY1 fallback: SSH reachable after ${i}s" + return 0 + fi + sleep 1 + done + return 1 +} + +# ── Validation ───────────────────────────────────────────────────────── +# Run a single SSH command, return its stdout. 
Failures are NOT fatal here +# — caller decides what's a hard failure. +remote() { + ssh -p $SSH_PORT \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -o BatchMode=yes \ + ${TEST_KEY:+-i "$TEST_KEY"} \ + "$SSH_USER@127.0.0.1" "$@" +} + +# Validation result accumulator. check_remote runs a shell snippet on the +# installed VM via SSH; the snippet must exit 0 for PASS, non-zero for +# FAIL. check_eq compares remote stdout to an expected literal. +VALIDATIONS=() + +# check_remote +# Runs the snippet via SSH, treats exit code as the verdict. +check_remote() { + local desc=$1 cmd=$2 + local out rc + out=$(remote "$cmd" 2>&1) + rc=$? + if (( rc == 0 )); then + VALIDATIONS+=("PASS $desc") + log " PASS: $desc" + else + # Truncate the failure context so the report stays scannable. + local trimmed=${out:0:120} + VALIDATIONS+=("FAIL $desc ($trimmed)") + log " FAIL: $desc -- $trimmed" + fi +} + +# check_eq +# Runs the snippet, trims trailing whitespace, compares to expected. +check_eq() { + local desc=$1 cmd=$2 expected=$3 + local got + got=$(remote "$cmd" 2>/dev/null | tr -d '\r' | tail -n1) + got=${got%[[:space:]]} + if [[ $got == "$expected" ]]; then + VALIDATIONS+=("PASS $desc (=$got)") + log " PASS: $desc (=$got)" + else + VALIDATIONS+=("FAIL $desc (got: '$got', expected: '$expected')") + log " FAIL: $desc -- got '$got' expected '$expected'" + fi +} + +run_validation() { + log "running validation checklist" + + # os-release + check_remote "/etc/os-release: NAME=veilor-os" \ + 'grep -q "^NAME=.veilor-os" /etc/os-release' + + check_eq "hostnamectl --static = veilor" \ + 'hostnamectl --static' "veilor" + + # Active services + for svc in sshd fail2ban usbguard tuned auditd firewalld chronyd sddm; do + check_eq "$svc is-active" \ + "systemctl is-active $svc" "active" + done + + # SELinux. 
v0.5.x kickstart sets `selinux --enforcing` for installed + # systems but veilor-firstboot may toggle behavior — accept either + # Enforcing or Permissive, but log which one we got. (Hard-fail on + # Disabled.) + local selinux + selinux=$(remote getenforce 2>/dev/null | tr -d '\r' | tail -n1) + selinux=${selinux%[[:space:]]} + if [[ $selinux == Enforcing ]]; then + VALIDATIONS+=("PASS SELinux = Enforcing") + log " PASS: SELinux = Enforcing" + elif [[ $selinux == Permissive ]]; then + VALIDATIONS+=("PASS SELinux = Permissive (acceptable for v0.5)") + log " PASS (soft): SELinux = Permissive" + else + VALIDATIONS+=("FAIL SELinux = $selinux") + log " FAIL: SELinux = $selinux" + fi + + # Disk layout: LUKS2 + btrfs. + check_remote "lsblk shows crypto_LUKS" \ + 'lsblk -f | grep -q crypto_LUKS' + check_remote "lsblk shows btrfs" \ + 'lsblk -f | grep -q btrfs' + check_remote "/etc/crypttab has LUKS entry" \ + 'grep -Ev "^\s*(#|$)" /etc/crypttab | grep -qi luks' + + # Admin user + check_remote "admin user exists" \ + 'getent passwd admin | grep -q "^admin:"' + + # CLI tools shipped via overlay. + for bin in veilor-power veilor-doctor veilor-update; do + check_remote "/usr/local/bin/$bin present" \ + "test -x /usr/local/bin/$bin" + done + + # init_on_alloc — veilor-installer kickstart sets it on the install + # cmdline (line 315). /proc/cmdline is the source of truth. 
+ check_remote "init_on_alloc=1 in /proc/cmdline" \ + 'grep -q init_on_alloc=1 /proc/cmdline' +} + +# ── Reporting ────────────────────────────────────────────────────────── +print_report() { + local pass=0 fail=0 + for line in "${VALIDATIONS[@]}"; do + case "$line" in + PASS*) pass=$((pass + 1)) ;; + FAIL*) fail=$((fail + 1)) ;; + esac + done + + { + echo "════════════════════════════════════════════════════════" + echo " veilor-os auto-install test report" + echo " $(date)" + echo "════════════════════════════════════════════════════════" + printf '%s\n' "${VALIDATIONS[@]}" + echo "────────────────────────────────────────────────────────" + printf 'TOTAL: %d PASS, %d FAIL\n' "$pass" "$fail" + echo "Logs: $LOG" + echo "Screenshots: ${SHOT_PREFIX}-NN-*.png" + echo "Serial log: $SERIAL_LOG" + echo "════════════════════════════════════════════════════════" + } | tee -a "$LOG" + + # Capture a final SSH session snapshot (uname/lsblk/sysctl) for the + # human reviewer. + { + echo "=== final ssh probe ===" + date + echo "--- uname -a ---" + remote uname -a 2>&1 + echo "--- lsblk -f ---" + remote lsblk -f 2>&1 + echo "--- /proc/cmdline ---" + remote cat /proc/cmdline 2>&1 + echo "--- systemctl --failed ---" + remote systemctl --failed 2>&1 + } > "${SHOT_PREFIX}-final-ssh.txt" 2>&1 || true + log "final ssh snapshot: ${SHOT_PREFIX}-final-ssh.txt" + + if (( fail > 0 )); then + return 1 + fi + return 0 +} + +cleanup() { + log "cleanup" + if [[ -n "${QEMU_PID:-}" ]] && kill -0 "$QEMU_PID" 2>/dev/null; then + # Graceful shutdown via monitor first; SIGTERM if it ignores us. 
+ km_monitor_send "$MONITOR_SOCK" "system_powerdown" 2>/dev/null || true + sleep 5 + if kill -0 "$QEMU_PID" 2>/dev/null; then + kill "$QEMU_PID" 2>/dev/null || true + sleep 2 + kill -9 "$QEMU_PID" 2>/dev/null || true + fi + fi + rm -f "$MONITOR_SOCK" +} + +# ── Main ─────────────────────────────────────────────────────────────── +main() { + trap cleanup EXIT + + preflight "$@" + kill_existing_vm + wipe_state + build_seed_iso + launch_vm "$ISO" + drive_installer + wait_for_install_and_reboot + run_validation + print_report +} + +main "$@"