diff --git a/common/nfs-services-server.nix b/common/nfs-services-server.nix index f0848e6..842ffd4 100644 --- a/common/nfs-services-server.nix +++ b/common/nfs-services-server.nix @@ -103,11 +103,14 @@ in ] ++ (lib.forEach cfg.standbys (standby: { "replicate-services-to-${standby}" = { description = "Replicate /persist/services to ${standby}"; - path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep ]; + path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep pkgs.curl ]; script = '' set -euo pipefail + START_TIME=$(date +%s) + REPLICATION_SUCCESS=0 + SSH_KEY="/persist/root/.ssh/btrfs-replication" if [ ! -f "$SSH_KEY" ]; then echo "ERROR: SSH key not found at $SSH_KEY" @@ -134,11 +137,13 @@ in ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ "btrfs receive /persist/services-standby"; then echo "Incremental send completed successfully" + REPLICATION_SUCCESS=1 else echo "Incremental send failed (likely missing parent on receiver), falling back to full send" btrfs send "$SNAPSHOT_PATH" | \ ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ "btrfs receive /persist/services-standby" + REPLICATION_SUCCESS=1 fi else # First snapshot, do full send @@ -146,10 +151,28 @@ in btrfs send "$SNAPSHOT_PATH" | \ ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ "btrfs receive /persist/services-standby" + REPLICATION_SUCCESS=1 fi - # Cleanup old snapshots on sender (keep last 24 hours = 288 snapshots at 5min intervals) - find /persist -maxdepth 1 -name 'services@*' -mmin +1440 -exec btrfs subvolume delete {} \; + # Cleanup old snapshots on sender (keep last 10 snapshots, sorted by name/timestamp) + ls -1d /persist/services@* 2>/dev/null | sort | head -n -10 | xargs -r btrfs subvolume delete + + # Calculate metrics + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + SNAPSHOT_COUNT=$(ls -1d /persist/services@* 2>/dev/null | wc -l) + + # Push metrics to Prometheus pushgateway + cat </dev/null | sort | head -n -10 | xargs -r btrfs subvolume delete || true - # Count existing snapshots - count=$(find /persist/services-standby -maxdepth 1 -name 'services@*' -type d | wc -l) + # Calculate metrics + CLEANUP_TIME=$(date +%s) + SNAPSHOT_COUNT=$(ls -1d /persist/services-standby/services@* 2>/dev/null | wc -l) - # Only delete old snapshots if we have more than the minimum - if [ $count -gt $MIN_KEEP ]; then - # Delete snapshots older than 4 hours - find /persist/services-standby -maxdepth 1 -name 'services@*' -mmin +240 -exec btrfs subvolume delete {} \; || true - else - echo "Only $count snapshots found, keeping all (minimum: $MIN_KEEP)" - fi + # Push metrics to Prometheus pushgateway + cat <