diff --git a/kickstart/veilor-os.ks b/kickstart/veilor-os.ks index 2ac860d..183d267 100644 --- a/kickstart/veilor-os.ks +++ b/kickstart/veilor-os.ks @@ -275,42 +275,72 @@ compression-algorithm = zstd EOF # Patch anaconda's transaction_progress.py inside the live rootfs so that -# when the user clicks "Install" from the live ISO and anaconda runs in -# --cmdline mode, a non-fatal scriptlet warning (RC=5) does not get -# escalated to "An error occurred during the transaction" + abort. +# when the user clicks "Install", a non-fatal RPM 6.0 *scriptlet* warning +# does not get escalated to "An error occurred during the transaction" +# and abort. # -# Why this is needed: Fedora 43 ships RPM 6.0, which changed scriptlet -# failure propagation (Fedora wiki Changes/RPM-6.0; dnf5 issue #2507). -# Scriptlets that previously emitted "Non-critical error" warnings now -# bubble up as transaction-level errors. man-db's -# `transfiletriggerin` is the most common trigger — `systemd-run -# /usr/bin/systemctl start man-db-cache-update` returns non-zero in -# the anaconda chroot, RPM-6.0-aware dnf5 reports it as transaction -# error, anaconda --cmdline aborts. +# This patch is NARROW — it overrides ONLY the `script_error` callback, +# not the consumer (`process_transaction_progress`). v0.5.28 had a broad +# patch that turned EVERY 'error' token into a warning, including +# `cpio_error` (payload corruption) and `unpack_error` (extraction +# failures). Side effect: silent grub2-efi-x64 scriptlet failure → +# /boot/efi/EFI/fedora/ left incomplete → `gen_grub_cfgstub` failed at +# the bootloader install phase. Narrowing eliminates that class of +# silent failure. # -# We previously patched the same file on the BUILD HOST (build/build-iso.sh) -# so livecd-creator could finish its own transaction. That patch lives -# only on the host running the build — never landed in the live rootfs -# the user installs from. 
Reproduced 3 consecutive VM tests -# (v0.5.26 / v0.5.27 / v0.5.28) failing at exactly "Configuring -# man-db.x86_64". +# Why a patch is needed at all: Fedora 43 ships RPM 6.0, which changed +# scriptlet failure propagation (Fedora wiki Changes/RPM-6.0; dnf5 issue +# 2507). Scriptlets that previously emitted "Non-critical error" +# warnings now bubble up as transaction-level errors. man-db's +# `transfiletriggerin` (`systemd-run /usr/bin/systemctl start +# man-db-cache-update`) is the most common trigger — non-zero in the +# anaconda chroot, RPM-6.0-aware dnf5 reports as error, anaconda +# --cmdline aborts. # -# The patch downgrades the 'error' token in transaction progress -# callback to a warning log line. Confirmed working at build time -# (build/build-iso.sh:47-51). +# After the patch: +# - script_error → log warning, do NOT enqueue 'error' (transaction +# continues; specific package's posttrans whose result we ignore is +# already in the install set, scriptlet has run as far as it can). +# - cpio_error / unpack_error / generic error → unchanged, still +# raise PayloadInstallationError as anaconda intends. Real +# transaction-fatal events still abort install (good). TP=/usr/lib64/python3.14/site-packages/pyanaconda/modules/payloads/payload/dnf/transaction_progress.py if [ -f "$TP" ]; then cp -a "$TP" "${TP}.veilor-bak" - sed -i 's|raise PayloadInstallationError("An error occurred during the transaction: " + msg)|log.warning("veilor: ignoring non-fatal transaction error: %s", msg)|' "$TP" - if grep -q 'veilor: ignoring' "$TP"; then - echo "[OK] transaction_progress.py patched in live rootfs" + + # Replace the script_error self._queue.put(('error', ...)) line with a + # warning log; nothing is enqueued in its place. The method is matched + # by its full signature, including the `return_code` argument. A + # `python3 -` heredoc is used instead of sed because the match spans + # several lines; only the trailing queue.put(...) call is rewritten. 
+ python3 - "$TP" <<'PYEOF' +import pathlib, re, sys +path = sys.argv[1] +src = pathlib.Path(path).read_text(encoding="utf-8") +# Rewrite only the queue.put(...) call inside script_error: the match may not run past the next def, and the call's own indentation is captured and reused +new = re.sub( + r'(def script_error\(self, item, nevra, type, return_code\):(?:(?!\n[ \t]*def ).)*?)\n([ \t]*)self\._queue\.put\(\(.error., item\.get_package\(\)\.to_string\(\)\)\)', + r'\1\n\2log.warning("veilor: ignoring non-fatal scriptlet failure rc=%s for %s", return_code, item.get_package().to_string() if item else "unknown")\n\2# do NOT enqueue "error" — let install continue (RPM 6.0 cmdline regression workaround)', + src, + flags=re.DOTALL, + count=1, +) +if new == src: + print("[ERR] script_error method not found in expected form — anaconda layout changed") + sys.exit(1) +pathlib.Path(path).write_text(new, encoding="utf-8") +print("[OK] transaction_progress.py: narrowed script_error override") +PYEOF + + if grep -q "veilor: ignoring non-fatal scriptlet" "$TP"; then # Drop the cached .pyc so the patched .py is what runs. rm -f /usr/lib64/python3.14/site-packages/pyanaconda/modules/payloads/payload/dnf/__pycache__/transaction_progress.*.pyc 2>/dev/null || true + echo "[OK] anaconda transaction_progress.py patched in live rootfs (script_error only)" else - echo "[WARN] transaction_progress.py patch did not apply — file format may have changed in this anaconda version" + echo "[WARN] transaction_progress.py patch did not apply — anaconda layout may have changed" fi else - echo "[WARN] transaction_progress.py not found at expected path — anaconda may have moved it" + echo "[WARN] transaction_progress.py not found at expected path" fi # Enable services diff --git a/overlay/usr/local/bin/veilor-installer b/overlay/usr/local/bin/veilor-installer index acc6f58..10f1c32 100644 --- a/overlay/usr/local/bin/veilor-installer +++ b/overlay/usr/local/bin/veilor-installer @@ -397,8 +397,22 @@ user --name=admin --groups=wheel --gecos="veilor admin" --password=__ADMIN_PW__ __SSHKEY_DIRECTIVE__ # Full hardening cmdline (installed system, not live): -# 
--location=none: anaconda auto-places bootloader (UEFI grub2-efi or BIOS). -bootloader --append="lockdown=integrity slab_nomerge init_on_alloc=1 init_on_free=1 randomize_kstack_offset=on vsyscall=none" +# - `lockdown=integrity` — kernel lockdown, integrity mode (signed module enforce) +# - `slab_nomerge` — refuse SLAB merging; harder heap-spray attacks +# - `init_on_alloc=1 init_on_free=1` — zero pages on alloc + free; defends +# uninit-read class; ~5% perf hit acceptable on hardened workstation +# - `randomize_kstack_offset=on` — KASLR for kernel stack, per-syscall +# - `vsyscall=none` — kill legacy vsyscall page (fixed-address +# ROP-gadget surface) +# - `fbcon=nodefer` — keep linux framebuffer console alive through KMS +# handoff so plymouth LUKS prompt and any boot-time text remain +# visible on real GPU drivers (intel/amdgpu/nvidia). Already in live +# ISO cmdline; was previously missing from installed-system cmdline, +# which produced a black-screen boot on real hardware until KMS +# stabilised. +# Anaconda picks bootloader location (UEFI ESP or BIOS MBR) automatically; +# `--location=mbr` would be cargo-cult on UEFI and risky on multi-disk. +bootloader --append="lockdown=integrity slab_nomerge init_on_alloc=1 init_on_free=1 randomize_kstack_offset=on vsyscall=none fbcon=nodefer" # Disk: zero, LUKS2 (argon2id), btrfs subvolumes (no LVM intermediary). # Native btrfs-on-LUKS matches Fedora KDE Spin defaults; LVM+btrfs combo @@ -608,7 +622,26 @@ sed -i \ # user lands in emergency shell on first boot. LUKS_UUID=$(blkid -t TYPE=crypto_LUKS -o value -s UUID 2>/dev/null | head -1) if [ -n "$LUKS_UUID" ]; then - LUKS_ARGS="rd.luks.uuid=luks-${LUKS_UUID}" + # Args: + # rd.luks.uuid=luks-XXX — tells dracut to expect a LUKS device, + # triggers cryptsetup-generator. + # rd.luks.options=...=tries=5 — five passphrase attempts before giving up + # (crypttab(5) default is 3; a failed unlock + # ends in the emergency shell, terrible UX). 
+ # rd.luks.options=...=timeout=0 — never time out the passphrase prompt + # (0 = wait indefinitely per crypttab(5); slow typing + # of a long passphrase still works). + # NOTE: rd.luks.options takes the bare LUKS UUID (no luks- prefix, unlike rd.luks.uuid). + # NOTE(review): per-UUID options override crypttab options for this device — confirm `discard` still applies as intended. + # fbcon=nodefer — keep linux framebuffer console alive + # through KMS handoff. Without this on + # real laptops the plymouth LUKS prompt + # draws into a frozen framebuffer and + # the user sees a black screen with a + # blinking cursor. Already in the live + # ISO bootloader cmdline; missing from the installed-system bootloader + # line in the generated install ks above (also fixed there). + LUKS_ARGS="rd.luks.uuid=luks-${LUKS_UUID} rd.luks.options=${LUKS_UUID}=tries=5,timeout=0 fbcon=nodefer" # Path 1: persist into /etc/default/grub so future kernels inherit. if ! grep -q "rd.luks.uuid" /etc/default/grub 2>/dev/null; then @@ -620,16 +653,54 @@ if [ -n "$LUKS_UUID" ]; then # the `options` line in-place. grubby --update-kernel=ALL --args="${LUKS_ARGS}" 2>&1 | tail -5 || true + # Verification: every BLS entry MUST carry the LUKS arg now. Empty + # output = success. + drift=$(grep -L "rd.luks.uuid" /boot/loader/entries/*.conf 2>/dev/null) + if [ -n "$drift" ]; then + echo "[WARN] BLS entries missing rd.luks.uuid: $drift" + fi + echo "[INFO] injected ${LUKS_ARGS} into /etc/default/grub + BLS entries" fi +# Verify anaconda wrote /etc/crypttab for the LUKS device. anaconda's +# custom-partitioning code path normally does this for `--encrypted` +# part directives; if it didn't (edge case, F43+ regressions), write +# a minimal entry so systemd-cryptsetup-generator can find the device +# at boot from the BLS args alone. +if [ -n "$LUKS_UUID" ] && ! grep -q "$LUKS_UUID" /etc/crypttab 2>/dev/null; then + echo "luks-${LUKS_UUID} UUID=${LUKS_UUID} none discard" >> /etc/crypttab + echo "[INFO] wrote /etc/crypttab fallback entry" +fi + # Switch plymouth to text-only `details` theme (scrolling boot log, no # graphics, no logo). Theme is built-in to plymouth package, no asset # install needed. 
v0.6 will ship custom veilor-themed plymouth. plymouth-set-default-theme details 2>/dev/null || true -# Regenerate initramfs with new theme baked in (plymouth modules read -# theme at initramfs build time). -dracut --force --regenerate-all 2>&1 | tail -3 || true + +# Force-include LUKS + plymouth modules in initramfs. dracut autodetects +# crypt+plymouth from the running config, but custom-partitioning %post +# runs before dracut sees stable LUKS state, and stale initramfs files +# from anaconda's pre-install kernel may persist. Belt-and-braces. +mkdir -p /etc/dracut.conf.d +cat > /etc/dracut.conf.d/10-veilor-luks.conf <<'DRACUTEOF' +# veilor-os: guarantee LUKS + plymouth modules in initramfs +add_dracutmodules+=" crypt systemd-cryptsetup plymouth " +DRACUTEOF + +# Regenerate initramfs with new theme + dracut.conf.d picks. --force +# overwrites each kernel's image in place, so nothing is deleted first: a failed +# regen must not leave /boot without an initramfs (or wipe a rescue image that --regenerate-all does not rebuild). +dracut --force --regenerate-all 2>&1 | tail -5 || true + +# Verify cryptsetup landed in initramfs. If not, LUKS unlock is impossible +# and the user gets emergency shell on first boot. Surfacing this early. +KVER=$(ls /lib/modules | head -1) +if [ -n "$KVER" ] && [ -f "/boot/initramfs-${KVER}.img" ]; then + if ! lsinitrd "/boot/initramfs-${KVER}.img" 2>/dev/null | grep -q cryptsetup; then + echo "[ERR] cryptsetup not found in initramfs — LUKS unlock will fail" + fi +fi # Regen grub.cfg with new branding (anaconda already wrote one; replace). grub2-mkconfig -o /boot/grub2/grub.cfg 2>/dev/null || true