Keep only the 10 most recent btrfs snapshots and push replication metrics to the Prometheus Pushgateway.

This commit is contained in:
2025-11-04 10:22:11 +00:00
parent 49afc0c084
commit 5ef4d832fb
2 changed files with 40 additions and 16 deletions

View File

@@ -103,11 +103,14 @@ in
] ++ (lib.forEach cfg.standbys (standby: { ] ++ (lib.forEach cfg.standbys (standby: {
"replicate-services-to-${standby}" = { "replicate-services-to-${standby}" = {
description = "Replicate /persist/services to ${standby}"; description = "Replicate /persist/services to ${standby}";
path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep ]; path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep pkgs.curl ];
script = '' script = ''
set -euo pipefail set -euo pipefail
START_TIME=$(date +%s)
REPLICATION_SUCCESS=0
SSH_KEY="/persist/root/.ssh/btrfs-replication" SSH_KEY="/persist/root/.ssh/btrfs-replication"
if [ ! -f "$SSH_KEY" ]; then if [ ! -f "$SSH_KEY" ]; then
echo "ERROR: SSH key not found at $SSH_KEY" echo "ERROR: SSH key not found at $SSH_KEY"
@@ -134,11 +137,13 @@ in
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
"btrfs receive /persist/services-standby"; then "btrfs receive /persist/services-standby"; then
echo "Incremental send completed successfully" echo "Incremental send completed successfully"
REPLICATION_SUCCESS=1
else else
echo "Incremental send failed (likely missing parent on receiver), falling back to full send" echo "Incremental send failed (likely missing parent on receiver), falling back to full send"
btrfs send "$SNAPSHOT_PATH" | \ btrfs send "$SNAPSHOT_PATH" | \
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
"btrfs receive /persist/services-standby" "btrfs receive /persist/services-standby"
REPLICATION_SUCCESS=1
fi fi
else else
# First snapshot, do full send # First snapshot, do full send
@@ -146,10 +151,28 @@ in
btrfs send "$SNAPSHOT_PATH" | \ btrfs send "$SNAPSHOT_PATH" | \
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
"btrfs receive /persist/services-standby" "btrfs receive /persist/services-standby"
REPLICATION_SUCCESS=1
fi fi
# Cleanup old snapshots on sender (keep last 24 hours = 288 snapshots at 5min intervals) # Cleanup old snapshots on sender (keep last 10 snapshots, sorted by name/timestamp)
find /persist -maxdepth 1 -name 'services@*' -mmin +1440 -exec btrfs subvolume delete {} \; ls -1d /persist/services@* 2>/dev/null | sort | head -n -10 | xargs -r btrfs subvolume delete
# Calculate metrics
END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))
SNAPSHOT_COUNT=$(ls -1d /persist/services@* 2>/dev/null | wc -l)
# Push metrics to Prometheus pushgateway
cat <<METRICS | curl --data-binary @- http://pushgateway.service.consul:9091/metrics/job/nfs_replication/instance/${standby}
# TYPE nfs_replication_last_success_timestamp gauge
nfs_replication_last_success_timestamp $END_TIME
# TYPE nfs_replication_duration_seconds gauge
nfs_replication_duration_seconds $DURATION
# TYPE nfs_replication_snapshot_count gauge
nfs_replication_snapshot_count $SNAPSHOT_COUNT
# TYPE nfs_replication_success gauge
nfs_replication_success $REPLICATION_SUCCESS
METRICS
''; '';
serviceConfig = { serviceConfig = {

View File

@@ -39,26 +39,27 @@ in
noCheck = true; noCheck = true;
}; };
# Cleanup old snapshots on standby (keep last 4 hours for HA failover) # Cleanup old snapshots on standby (keep last 10 snapshots)
systemd.services.cleanup-services-standby-snapshots = { systemd.services.cleanup-services-standby-snapshots = {
description = "Cleanup old btrfs snapshots in services-standby"; description = "Cleanup old btrfs snapshots in services-standby";
path = [ pkgs.btrfs-progs pkgs.findutils pkgs.coreutils ]; path = [ pkgs.btrfs-progs pkgs.findutils pkgs.coreutils pkgs.curl ];
script = '' script = ''
set -euo pipefail set -euo pipefail
# Keep at least 2 hours of snapshots (24 snapshots at 5min intervals) # Cleanup old snapshots on standby (keep last 10 snapshots, sorted by name/timestamp)
MIN_KEEP=24 ls -1d /persist/services-standby/services@* 2>/dev/null | sort | head -n -10 | xargs -r btrfs subvolume delete || true
# Count existing snapshots # Calculate metrics
count=$(find /persist/services-standby -maxdepth 1 -name 'services@*' -type d | wc -l) CLEANUP_TIME=$(date +%s)
SNAPSHOT_COUNT=$(ls -1d /persist/services-standby/services@* 2>/dev/null | wc -l)
# Only delete old snapshots if we have more than the minimum # Push metrics to Prometheus pushgateway
if [ $count -gt $MIN_KEEP ]; then cat <<METRICS | curl --data-binary @- http://pushgateway.service.consul:9091/metrics/job/nfs_standby_cleanup/instance/$(hostname)
# Delete snapshots older than 4 hours # TYPE nfs_standby_snapshot_count gauge
find /persist/services-standby -maxdepth 1 -name 'services@*' -mmin +240 -exec btrfs subvolume delete {} \; || true nfs_standby_snapshot_count $SNAPSHOT_COUNT
else # TYPE nfs_standby_cleanup_last_run_timestamp gauge
echo "Only $count snapshots found, keeping all (minimum: $MIN_KEEP)" nfs_standby_cleanup_last_run_timestamp $CLEANUP_TIME
fi METRICS
''; '';
serviceConfig = { serviceConfig = {
Type = "oneshot"; Type = "oneshot";