minecraft-server/scripts/backup.sh
s8n 4c16cebb2b backup: phase 1 + phase 2 scripts; daily script repaired and deployed
Repairs the orphaned synapse-signing-key block at scripts/backup.sh
lines 119-122 that was exiting the script under set -e before the
Minecraft block could run, leaving 5 of the last 7 days without a
world backup and zero usable snapshots after 7-day retention.

Phase 1 (deployed today to /opt/docker/backup.sh on nullstone):
- Repaired script — orphan block removed, MC arm wrapped so failures
  in one tar don't kill the run
- tar exit code 1 ("file changed as we read it") now treated as
  success on the live MC world; spark profiler tmp file noise
  silenced via --ignore-failed-read --warning=no-file-changed
- Plugin DBs (homestead, AuthMe, CoreProtect, LuckPerms) and configs
  now backed up alongside the world
- Sentinel /opt/backups/.last-success stamped only when the world
  arm succeeds — gives outside monitors a single mtime to alert on
- Manually verified end-to-end: 12G world tarball, 492M plugins,
  279M dbs, 14 config files, sentinel updated. Pre-fix script saved
  at /opt/docker/backup.sh.bak-20260507-pre-phase1.

Phase 2 (scripts in repo, deployment pending operator sudo):
- scripts/restic-backup-playerdata.sh — Class A 5-min restic snapshots
  of playerdata/, stats/, advancements/, plugin DBs, LuckPerms;
  rcon save-all flush before snapshot; tag-scoped retention
- scripts/restic-init.sh — one-time bootstrap (root-only) for
  /etc/mc-backup.{env,pw} + repo init at /home/user/restic/
- scripts/systemd/mc-backup-playerdata.{service,timer} — 5-min timer
  with hardening (ProtectSystem=strict, ReadOnlyPaths, etc)
- docs/RUNBOOK-BACKUP-RESTORE.md updated with both phases'
  deployment steps and the operator-action checklist

Off-host mirror to onyx (Phase 4) and class B/C/D world snapshots
(Phase 3) are still TODO — see BACKUP-STRATEGY.md §11 phase plan.
2026-05-07 18:29:30 +01:00

268 lines
12 KiB
Bash
Executable file

#!/usr/bin/env bash
# /opt/docker/backup.sh
#
# Daily backup of all Docker service databases, named volumes, and the
# Minecraft world to /opt/backups/. Runs as root via cron at 02:00 with
# 7-day retention.
#
# Phase 1 of BACKUP-STRATEGY.md ("stop the bleeding") — repairs the
# orphaned synapse-signing-key block that was killing the script under
# `set -e` before the Minecraft section ran. Also adds structured
# logging and a sentinel `.last-success` file so silent failures are
# detectable from outside the script.
#
# A separate Phase 2 (restic playerdata snapshots every 5 min) is
# delivered by scripts/restic-backup-playerdata.sh + the systemd unit
# pair under scripts/systemd/. This file remains the safety net.
set -euo pipefail
umask 077
# Immutable run parameters — marked readonly so nothing later in the
# run (or a future edit) can clobber them mid-backup.
readonly BACKUP_DIR="/opt/backups"
# Declaration split from assignment so a failing $(date) isn't masked.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
readonly TIMESTAMP
readonly BACKUP_PATH="${BACKUP_DIR}/${TIMESTAMP}"
readonly LOG="${BACKUP_DIR}/backup.log"
readonly SENTINEL="${BACKUP_DIR}/.last-success"
readonly KEEP_DAYS=7
# Track whether each backup arm succeeded so we can honour the
# sentinel contract: only stamp .last-success if the *world* (the
# critical T1 case) was captured. Other arms can fail without
# blocking the sentinel — they have their own logged FAILED lines.
# Deliberately NOT readonly: the Minecraft arm flips this to 1.
MC_WORLD_OK=0
log() {
  # Emit a timestamped line to stdout and append the same line to $LOG.
  local stamp line
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  line="[${stamp}] $*"
  printf '%s\n' "$line" | tee -a "$LOG"
}
# Per-run destination directory (/opt/backups/<timestamp>); with the
# umask 077 above it is created 0700, root-only.
mkdir -p "$BACKUP_PATH"
log "=== Backup started: ${TIMESTAMP} ==="
# ── Matrix PostgreSQL ──────────────────────────────────────────────
# Stream pg_dump of the synapse database through gzip into the run dir.
log "Dumping Matrix PostgreSQL..."
PG_DUMP_OUT="${BACKUP_PATH}/matrix-postgres-${TIMESTAMP}.sql.gz"
if docker ps --format '{{.Names}}' | grep -q '^matrix-postgres$'; then
  if docker exec matrix-postgres pg_dump -U synapse synapse | gzip > "$PG_DUMP_OUT"; then
    log " Matrix Postgres: OK ($(du -sh "$PG_DUMP_OUT" | cut -f1))"
  else
    log " Matrix Postgres: FAILED"
  fi
else
  log " matrix-postgres not running — skipping"
fi
# ── Rocket.Chat MongoDB ────────────────────────────────────────────
log "Dumping Rocket.Chat MongoDB..."
if docker ps --format '{{.Names}}' | grep -q '^mongodb$'; then
# Credential comes from the environment when provided; the original
# literal placeholder is kept as the fallback so behaviour is
# unchanged until the operator exports MONGO_ADMIN_PASSWORD.
# NOTE(review): the password still rides on argv and is visible in
# `ps` for the dump's duration — consider mongodump --config instead.
if docker exec mongodb mongodump \
-u admin -p "${MONGO_ADMIN_PASSWORD:-CHANGE_ME_MONGO_ADMIN_PASSWORD}" \
--authenticationDatabase admin \
--db rocketchat \
--archive \
| gzip > "${BACKUP_PATH}/rocketchat-mongo-${TIMESTAMP}.archive.gz"; then
log " MongoDB: OK ($(du -sh "${BACKUP_PATH}/rocketchat-mongo-${TIMESTAMP}.archive.gz" | cut -f1))"
else
log " MongoDB: FAILED"
fi
else
log " mongodb not running — skipping"
fi
# ── Named Docker volumes ───────────────────────────────────────────
log "Backing up Docker volumes..."
for VOLUME in synapse-media rocketchat-uploads; do
  # Compose may namespace the volume (matrix_<name> / rocketchat_<name>)
  # or it may exist bare. Use one fully-anchored pattern for BOTH the
  # existence check and the name resolution — the previous resolution
  # grep was an unanchored substring match and could have selected an
  # unrelated volume (e.g. "synapse-media-old").
  VOL_RE="^(matrix_${VOLUME}|rocketchat_${VOLUME}|${VOLUME})$"
  if docker volume ls --format '{{.Name}}' | grep -Eq "$VOL_RE"; then
    ACTUAL_VOL=$(docker volume ls --format '{{.Name}}' | grep -E "$VOL_RE" | head -1)
    # tar the volume contents from a throwaway alpine container; the
    # volume is mounted read-only so the backup can't mutate it.
    if docker run --rm \
      -v "${ACTUAL_VOL}:/volume:ro" \
      -v "${BACKUP_PATH}:/backup" \
      alpine \
      tar czf "/backup/${VOLUME}-${TIMESTAMP}.tar.gz" -C /volume . ; then
      log " Volume ${VOLUME}: OK"
    else
      log " Volume ${VOLUME}: FAILED"
    fi
  else
    log " Volume ${VOLUME}: not found — skipping"
  fi
done
# ── Config files (bind mounts) ─────────────────────────────────────
log "Backing up config directories..."
CONFIG_PATHS=(
  /opt/docker/traefik/traefik.yml
  /opt/docker/traefik/config/
  /opt/docker/matrix/docker-compose.yml
  /opt/docker/matrix/element-config/
  /opt/docker/matrix/synapse-config/homeserver.yaml
  /opt/docker/matrix/synapse-config/matrix.example.com.log.config
  /opt/docker/rocketchat/docker-compose.yml
)
# tar exits non-zero when any listed path is absent; report that as
# "partial" rather than a hard failure (stderr deliberately muted).
if tar czf "${BACKUP_PATH}/configs-${TIMESTAMP}.tar.gz" "${CONFIG_PATHS[@]}" 2>/dev/null; then
  log " Configs: OK"
else
  log " Configs: partial (some files missing)"
fi
# Synapse signing key — sensitive, copy out separately with tight perms.
SIGNING_KEY_SRC="/opt/docker/matrix/synapse-config/matrix.example.com.signing.key"
SIGNING_KEY_DST="${BACKUP_PATH}/synapse-signing-key-${TIMESTAMP}.key"
if [ -f "$SIGNING_KEY_SRC" ]; then
  cp "$SIGNING_KEY_SRC" "$SIGNING_KEY_DST"
  chmod 600 "$SIGNING_KEY_DST"
  log " Synapse signing key: backed up (600)"
fi
# ── Minecraft server ───────────────────────────────────────────────
# This is the block that was missing from the deployed copy and
# corrupted by an orphaned synapse-signing-key fragment in the repo
# copy. The arm is invoked through an if-guard below, which suspends
# `set -e` inside it, so a failure here does NOT abort the script —
# the prune step and sentinel logic still run.
log "Backing up Minecraft server..."

# Success predicate for tar exit codes. GNU tar: 0 = clean,
# 1 = "some files differed/changed during read" (normal on a live MC
# server — chunks save while we read), 2 = fatal. Accept 0 and 1.
tar_ok() {
  case "$1" in
    0|1) return 0 ;;
    *)   return 1 ;;
  esac
}
#######################################
# mc_backup — Minecraft arm of the nightly run.
# Globals read:    TIMESTAMP, BACKUP_PATH, MC_RCON_PASSWORD (optional)
# Globals written: MC_WORLD_OK — set to 1 only when a world archive
#                  (live in-container tarball, or the offline full
#                  backup) actually landed in BACKUP_PATH.
# Calls:           log, tar_ok; docker/mcrcon/tar externally.
# Invoked from a condition (`if ! mc_backup`), which suspends `set -e`
# inside the body — individual failures are logged, never fatal.
#######################################
mc_backup() {
if docker ps --format '{{.Names}}' | grep -q '^minecraft-mc$'; then
# Server running — flush via rcon if mcrcon installed, then
# tar inside the container so we get a consistent point-in-time.
# NOTE(review): the "*redacted*" fallback can never be the real rcon
# password, so the flush silently no-ops unless MC_RCON_PASSWORD is
# exported — confirm the cron environment actually provides it.
if command -v mcrcon >/dev/null 2>&1; then
mcrcon -H 127.0.0.1 -P 25575 \
-p "${MC_RCON_PASSWORD:-*redacted*}" \
-w 1 "save-all flush" >/dev/null 2>&1 || true
fi
# World tar — runs inside the container. We ignore tar exit 1
# ("file changed as we read it") because that's expected on a
# live server and the resulting archive is still usable.
local tar_rc=0
docker exec minecraft-mc bash -c \
"cd /data && tar czf /tmp/mc-world-backup-${TIMESTAMP}.tar.gz world/ world_nether/ world_the_end/" \
>/dev/null 2>&1 || tar_rc=$?
# Success requires all three steps: acceptable tar rc, copy out of
# the container, and removal of the in-container temp file.
# NOTE(review): a failed in-container rm after a successful copy is
# logged as FAILED even though the archive landed — worth knowing
# when reading the log.
if tar_ok "$tar_rc" \
&& docker cp "minecraft-mc:/tmp/mc-world-backup-${TIMESTAMP}.tar.gz" "${BACKUP_PATH}/" >/dev/null 2>&1 \
&& docker exec minecraft-mc rm -f "/tmp/mc-world-backup-${TIMESTAMP}.tar.gz" >/dev/null 2>&1; then
local sz
sz=$(du -sh "${BACKUP_PATH}/mc-world-backup-${TIMESTAMP}.tar.gz" | cut -f1)
if [ "$tar_rc" -eq 1 ]; then
log " Minecraft world: OK (${sz}) [tar exit 1 — files changed during read, expected on live server]"
else
log " Minecraft world: OK (${sz})"
fi
MC_WORLD_OK=1
else
log " Minecraft world: FAILED (tar_rc=${tar_rc})"
# Best-effort cleanup of any half-written file inside the container.
docker exec minecraft-mc rm -f "/tmp/mc-world-backup-${TIMESTAMP}.tar.gz" >/dev/null 2>&1 || true
fi
# Plugins (jars + on-disk config) — small, do this regardless
# of world result so we always have plugin state on hand.
# `--ignore-failed-read` suppresses spark profiler tmp files
# (running JFR files briefly mode 600); `--warning=no-file-changed`
# silences CoreProtect db noise in the log.
local prc=0
tar --ignore-failed-read --warning=no-file-changed \
-czf "${BACKUP_PATH}/minecraft-plugins-${TIMESTAMP}.tar.gz" \
-C /opt/docker/minecraft plugins/ >/dev/null 2>&1 || prc=$?
if tar_ok "$prc"; then
log " Minecraft plugins: OK ($(du -sh "${BACKUP_PATH}/minecraft-plugins-${TIMESTAMP}.tar.gz" | cut -f1))"
else
log " Minecraft plugins: FAILED (rc=${prc})"
fi
# Plugin DBs — copied (not dumped, all SQLite/file-based) into
# a tagged tarball so restore is straightforward.
local drc=0
tar --ignore-failed-read --warning=no-file-changed \
-czf "${BACKUP_PATH}/minecraft-dbs-${TIMESTAMP}.tar.gz" \
-C /opt/docker/minecraft \
homestead_data.db \
plugins/AuthMe/authme.db \
plugins/CoreProtect/database.db \
plugins/LuckPerms/ \
>/dev/null 2>&1 || drc=$?
if tar_ok "$drc"; then
log " Minecraft DBs: OK ($(du -sh "${BACKUP_PATH}/minecraft-dbs-${TIMESTAMP}.tar.gz" | cut -f1))"
else
log " Minecraft DBs: partial (rc=${drc} — some files may be missing)"
fi
# Server-side configs and access lists. Some of these files are
# optional (eg whitelist.json absent when whitelisting is off).
# tar reports rc=2 for missing files, so we prefilter the list.
# NOTE(review): if every listed file were absent, cfg_files would be
# empty — tar then exits 2 (logged FAILED), and on bash < 4.4 the
# empty-array expansion under `set -u` would error; confirm the host
# bash version if this arm ever runs on another machine.
local cfg_files=()
for f in server.properties purpur.yml spigot.yml bukkit.yml \
commands.yml help.yml permissions.yml \
ops.json whitelist.json banned-players.json banned-ips.json \
usercache.json eula.txt docker-compose.yml; do
[ -e "/opt/docker/minecraft/$f" ] && cfg_files+=("$f")
done
local crc=0
tar czf "${BACKUP_PATH}/minecraft-configs-${TIMESTAMP}.tar.gz" \
-C /opt/docker/minecraft "${cfg_files[@]}" \
>/dev/null 2>&1 || crc=$?
if tar_ok "$crc"; then
log " Minecraft configs: OK (${#cfg_files[@]} files)"
else
log " Minecraft configs: FAILED (rc=${crc})"
fi
else
# Server stopped — back up everything from disk directly.
local frc=0
tar czf "${BACKUP_PATH}/minecraft-full-backup-${TIMESTAMP}.tar.gz" \
-C /opt/docker/minecraft \
world/ \
world_nether/ \
world_the_end/ \
plugins/ \
homestead_data.db \
server.properties \
purpur.yml \
spigot.yml \
bukkit.yml \
ops.json \
whitelist.json \
banned-players.json \
banned-ips.json \
usercache.json \
docker-compose.yml \
>/dev/null 2>&1 || frc=$?
if tar_ok "$frc"; then
log " Minecraft (full, offline): OK ($(du -sh "${BACKUP_PATH}/minecraft-full-backup-${TIMESTAMP}.tar.gz" | cut -f1))"
MC_WORLD_OK=1
else
log " Minecraft (offline): partial (rc=${frc})"
fi
fi
}
# Run the MC arm. Putting the call on the left of `||` suspends
# `set -e` inside the function body, so a failure inside it can never
# abort the script before pruning and the sentinel logic below run.
mc_backup || log " Minecraft arm exited non-zero — see lines above"
# ── Prune old backups ──────────────────────────────────────────────
# Remove per-run directories older than KEEP_DAYS and stale logs.
# `-mindepth 1` guards against find matching $BACKUP_DIR itself
# (depth 0): without it, if the backup root's mtime ever drifted past
# the retention window, the -exec rm -rf would delete the entire
# backup tree. Errors (e.g. permission noise) are deliberately muted.
prune_old_backups() {
  find "$BACKUP_DIR" -mindepth 1 -maxdepth 1 -type d -mtime "+${KEEP_DAYS}" -exec rm -rf {} + 2>/dev/null || true
  find "$BACKUP_DIR" -mindepth 1 -maxdepth 1 -name "*.log" -mtime +30 -delete 2>/dev/null || true
}
log "Pruning backups older than ${KEEP_DAYS} days..."
prune_old_backups
BACKUP_SIZE=$(du -sh "$BACKUP_PATH" | cut -f1)
log "=== Backup complete: ${BACKUP_PATH} (${BACKUP_SIZE}) ==="
# ── Sentinel ───────────────────────────────────────────────────────
# Stamp .last-success only when the T1 world backup was captured. An
# external monitor (cron on onyx, or ntfy/healthchecks once wired)
# can alert on `find /opt/backups/.last-success -mmin +1500` to catch
# silent failures within 25h of a missed daily run.
if [ "$MC_WORLD_OK" -eq 1 ]; then
  printf 'last_success=%s\nbackup_path=%s\nbackup_size=%s\n' \
    "$(date -Iseconds)" "$BACKUP_PATH" "$BACKUP_SIZE" > "$SENTINEL"
  log "Sentinel updated: ${SENTINEL}"
else
  log "WARNING: world backup did NOT succeed — sentinel NOT updated"
fi