Only keep 10 snapshots, and push metrics.
This commit is contained in:
@@ -103,11 +103,14 @@ in
|
|||||||
] ++ (lib.forEach cfg.standbys (standby: {
|
] ++ (lib.forEach cfg.standbys (standby: {
|
||||||
"replicate-services-to-${standby}" = {
|
"replicate-services-to-${standby}" = {
|
||||||
description = "Replicate /persist/services to ${standby}";
|
description = "Replicate /persist/services to ${standby}";
|
||||||
path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep ];
|
path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep pkgs.curl ];
|
||||||
|
|
||||||
script = ''
|
script = ''
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
START_TIME=$(date +%s)
|
||||||
|
REPLICATION_SUCCESS=0
|
||||||
|
|
||||||
SSH_KEY="/persist/root/.ssh/btrfs-replication"
|
SSH_KEY="/persist/root/.ssh/btrfs-replication"
|
||||||
if [ ! -f "$SSH_KEY" ]; then
|
if [ ! -f "$SSH_KEY" ]; then
|
||||||
echo "ERROR: SSH key not found at $SSH_KEY"
|
echo "ERROR: SSH key not found at $SSH_KEY"
|
||||||
@@ -134,11 +137,13 @@ in
|
|||||||
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
|
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
|
||||||
"btrfs receive /persist/services-standby"; then
|
"btrfs receive /persist/services-standby"; then
|
||||||
echo "Incremental send completed successfully"
|
echo "Incremental send completed successfully"
|
||||||
|
REPLICATION_SUCCESS=1
|
||||||
else
|
else
|
||||||
echo "Incremental send failed (likely missing parent on receiver), falling back to full send"
|
echo "Incremental send failed (likely missing parent on receiver), falling back to full send"
|
||||||
btrfs send "$SNAPSHOT_PATH" | \
|
btrfs send "$SNAPSHOT_PATH" | \
|
||||||
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
|
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
|
||||||
"btrfs receive /persist/services-standby"
|
"btrfs receive /persist/services-standby"
|
||||||
|
REPLICATION_SUCCESS=1
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
# First snapshot, do full send
|
# First snapshot, do full send
|
||||||
@@ -146,10 +151,28 @@ in
|
|||||||
btrfs send "$SNAPSHOT_PATH" | \
|
btrfs send "$SNAPSHOT_PATH" | \
|
||||||
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
|
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
|
||||||
"btrfs receive /persist/services-standby"
|
"btrfs receive /persist/services-standby"
|
||||||
|
REPLICATION_SUCCESS=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Cleanup old snapshots on sender (keep last 24 hours = 288 snapshots at 5min intervals)
|
# Cleanup old snapshots on sender (keep last 10 snapshots, sorted by name/timestamp)
|
||||||
find /persist -maxdepth 1 -name 'services@*' -mmin +1440 -exec btrfs subvolume delete {} \;
|
# Keep only the 10 newest snapshots; this relies on snapshot names sorting chronologically.
# find is used instead of parsing ls: it exits 0 when nothing matches, so pipefail is not tripped.
# A failed delete is logged rather than fatal, so the metrics that follow are still pushed.
find /persist -maxdepth 1 -name 'services@*' | LC_ALL=C sort | head -n -10 | xargs -r btrfs subvolume delete || echo "WARNING: failed to delete some old snapshots on sender" >&2
|
||||||
|
|
||||||
|
# Calculate metrics
|
||||||
|
END_TIME=$(date +%s)
|
||||||
|
DURATION=$((END_TIME - START_TIME))
|
||||||
|
# Count remaining snapshots with find: unlike `ls <glob>`, it exits 0 when nothing matches,
# so the assignment cannot abort the script under `set -euo pipefail`.
SNAPSHOT_COUNT=$(find /persist -maxdepth 1 -name 'services@*' | wc -l)
|
||||||
|
|
||||||
|
# Push metrics to Prometheus pushgateway
|
||||||
|
# Metrics delivery is best-effort: a pushgateway outage, HTTP error or timeout is logged
# but must not mark an otherwise finished replication run as failed (or hang the unit).
# NOTE(review): under `set -e` a failed full send appears to abort the script before this
# point, so nfs_replication_success=0 would never be pushed - confirm, and consider an EXIT trap.
cat <<METRICS | curl --fail --silent --show-error --max-time 10 --data-binary @- http://pushgateway.service.consul:9091/metrics/job/nfs_replication/instance/${standby} || echo "WARNING: failed to push replication metrics to pushgateway" >&2
|
||||||
|
# TYPE nfs_replication_last_success_timestamp gauge
|
||||||
|
nfs_replication_last_success_timestamp $END_TIME
|
||||||
|
# TYPE nfs_replication_duration_seconds gauge
|
||||||
|
nfs_replication_duration_seconds $DURATION
|
||||||
|
# TYPE nfs_replication_snapshot_count gauge
|
||||||
|
nfs_replication_snapshot_count $SNAPSHOT_COUNT
|
||||||
|
# TYPE nfs_replication_success gauge
|
||||||
|
nfs_replication_success $REPLICATION_SUCCESS
|
||||||
|
METRICS
|
||||||
'';
|
'';
|
||||||
|
|
||||||
serviceConfig = {
|
serviceConfig = {
|
||||||
|
|||||||
@@ -39,26 +39,27 @@ in
|
|||||||
noCheck = true;
|
noCheck = true;
|
||||||
};
|
};
|
||||||
|
|
||||||
# Cleanup old snapshots on standby (keep last 4 hours for HA failover)
|
# Cleanup old snapshots on standby (keep last 10 snapshots)
|
||||||
systemd.services.cleanup-services-standby-snapshots = {
|
systemd.services.cleanup-services-standby-snapshots = {
|
||||||
description = "Cleanup old btrfs snapshots in services-standby";
|
description = "Cleanup old btrfs snapshots in services-standby";
|
||||||
path = [ pkgs.btrfs-progs pkgs.findutils pkgs.coreutils ];
|
path = [ pkgs.btrfs-progs pkgs.findutils pkgs.coreutils pkgs.curl ];
|
||||||
script = ''
|
script = ''
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
# Keep at least 2 hours of snapshots (24 snapshots at 5min intervals)
|
# Cleanup old snapshots on standby (keep last 10 snapshots, sorted by name/timestamp)
|
||||||
MIN_KEEP=24
|
# Keep only the 10 newest snapshots; this relies on snapshot names sorting chronologically.
# Still best-effort, but a failure is now logged instead of being silently discarded.
find /persist/services-standby -maxdepth 1 -name 'services@*' | LC_ALL=C sort | head -n -10 | xargs -r btrfs subvolume delete || echo "WARNING: failed to delete some old standby snapshots" >&2
|
||||||
|
|
||||||
# Count existing snapshots
|
# Calculate metrics
|
||||||
count=$(find /persist/services-standby -maxdepth 1 -name 'services@*' -type d | wc -l)
|
CLEANUP_TIME=$(date +%s)
|
||||||
|
# Count snapshots with find: `ls <glob>` exits non-zero when no snapshot exists, which under
# `set -euo pipefail` would abort the unit on an empty standby; find exits 0 and yields 0.
SNAPSHOT_COUNT=$(find /persist/services-standby -maxdepth 1 -name 'services@*' | wc -l)
|
||||||
|
|
||||||
# Only delete old snapshots if we have more than the minimum
|
# Push metrics to Prometheus pushgateway
|
||||||
if [ $count -gt $MIN_KEEP ]; then
|
# Metrics delivery is best-effort: a pushgateway outage, HTTP error or timeout is logged
# but does not fail (or hang) the cleanup unit.
# NOTE(review): `hostname` is not obviously provided by the packages in this unit's path;
# the kernel hostname is read with cat (coreutils) so the instance label cannot end up empty.
cat <<METRICS | curl --fail --silent --show-error --max-time 10 --data-binary @- "http://pushgateway.service.consul:9091/metrics/job/nfs_standby_cleanup/instance/$(cat /proc/sys/kernel/hostname)" || echo "WARNING: failed to push standby cleanup metrics to pushgateway" >&2
|
||||||
# Delete snapshots older than 4 hours
|
# TYPE nfs_standby_snapshot_count gauge
|
||||||
find /persist/services-standby -maxdepth 1 -name 'services@*' -mmin +240 -exec btrfs subvolume delete {} \; || true
|
nfs_standby_snapshot_count $SNAPSHOT_COUNT
|
||||||
else
|
# TYPE nfs_standby_cleanup_last_run_timestamp gauge
|
||||||
echo "Only $count snapshots found, keeping all (minimum: $MIN_KEEP)"
|
nfs_standby_cleanup_last_run_timestamp $CLEANUP_TIME
|
||||||
fi
|
METRICS
|
||||||
'';
|
'';
|
||||||
serviceConfig = {
|
serviceConfig = {
|
||||||
Type = "oneshot";
|
Type = "oneshot";
|
||||||
|
|||||||
Reference in New Issue
Block a user