diff --git a/common/nfs-services-server.nix b/common/nfs-services-server.nix
index f0848e6..842ffd4 100644
--- a/common/nfs-services-server.nix
+++ b/common/nfs-services-server.nix
@@ -103,11 +103,14 @@ in
     ] ++ (lib.forEach cfg.standbys (standby: {
         "replicate-services-to-${standby}" = {
           description = "Replicate /persist/services to ${standby}";
-          path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep ];
+          path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep pkgs.curl ];
 
           script = ''
             set -euo pipefail
 
+            START_TIME=$(date +%s)
+            REPLICATION_SUCCESS=0
+
             SSH_KEY="/persist/root/.ssh/btrfs-replication"
             if [ ! -f "$SSH_KEY" ]; then
               echo "ERROR: SSH key not found at $SSH_KEY"
@@ -134,11 +137,13 @@ in
                  ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
                  "btrfs receive /persist/services-standby"; then
                 echo "Incremental send completed successfully"
+                REPLICATION_SUCCESS=1
               else
                 echo "Incremental send failed (likely missing parent on receiver), falling back to full send"
                 btrfs send "$SNAPSHOT_PATH" | \
                   ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
                   "btrfs receive /persist/services-standby"
+                REPLICATION_SUCCESS=1
               fi
             else
               # First snapshot, do full send
@@ -146,10 +151,28 @@ in
               btrfs send "$SNAPSHOT_PATH" | \
                 ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
                 "btrfs receive /persist/services-standby"
+              REPLICATION_SUCCESS=1
             fi
 
-            # Cleanup old snapshots on sender (keep last 24 hours = 288 snapshots at 5min intervals)
-            find /persist -maxdepth 1 -name 'services@*' -mmin +1440 -exec btrfs subvolume delete {} \;
+            # Cleanup old snapshots on sender (keep last 10 snapshots, sorted by name/timestamp)
+            ls -1d /persist/services@* 2>/dev/null | sort | head -n -10 | xargs -r btrfs subvolume delete
+
+            # Calculate metrics
+            END_TIME=$(date +%s)
+            DURATION=$((END_TIME - START_TIME))
+            SNAPSHOT_COUNT=$(ls -1d /persist/services@* 2>/dev/null | wc -l)
+
+            # Push metrics to Prometheus pushgateway (best-effort: a failed or slow push must not fail an otherwise successful replication run)
+            cat <<METRICS | curl -fsS --max-time 10 --data-binary @- http://pushgateway.service.consul:9091/metrics/job/nfs_replication/instance/${standby} || echo "WARNING: failed to push replication metrics to pushgateway" >&2
+            # TYPE nfs_replication_last_success_timestamp gauge
+            nfs_replication_last_success_timestamp $END_TIME
+            # TYPE nfs_replication_duration_seconds gauge
+            nfs_replication_duration_seconds $DURATION
+            # TYPE nfs_replication_snapshot_count gauge
+            nfs_replication_snapshot_count $SNAPSHOT_COUNT
+            # TYPE nfs_replication_success gauge
+            nfs_replication_success $REPLICATION_SUCCESS
+            METRICS
           '';
 
           serviceConfig = {
diff --git a/common/nfs-services-standby.nix b/common/nfs-services-standby.nix
index d312b46..738a75f 100644
--- a/common/nfs-services-standby.nix
+++ b/common/nfs-services-standby.nix
@@ -39,26 +39,27 @@ in
       noCheck = true;
     };
 
-    # Cleanup old snapshots on standby (keep last 4 hours for HA failover)
+    # Cleanup old snapshots on standby (keep last 10 snapshots)
     systemd.services.cleanup-services-standby-snapshots = {
       description = "Cleanup old btrfs snapshots in services-standby";
-      path = [ pkgs.btrfs-progs pkgs.findutils pkgs.coreutils ];
+      path = [ pkgs.btrfs-progs pkgs.findutils pkgs.coreutils pkgs.curl ];
       script = ''
         set -euo pipefail
 
-        # Keep at least 2 hours of snapshots (24 snapshots at 5min intervals)
-        MIN_KEEP=24
+        # Cleanup old snapshots on standby (keep last 10 snapshots, sorted by name/timestamp)
+        ls -1d /persist/services-standby/services@* 2>/dev/null | sort | head -n -10 | xargs -r btrfs subvolume delete || true
 
-        # Count existing snapshots
-        count=$(find /persist/services-standby -maxdepth 1 -name 'services@*' -type d | wc -l)
+        # Calculate metrics (count with find: it exits 0 on zero matches, whereas an unmatched ls glob would abort the script under set -e / pipefail)
+        CLEANUP_TIME=$(date +%s)
+        SNAPSHOT_COUNT=$(find /persist/services-standby -maxdepth 1 -name 'services@*' -type d | wc -l)
 
-        # Only delete old snapshots if we have more than the minimum
-        if [ $count -gt $MIN_KEEP ]; then
-          # Delete snapshots older than 4 hours
-          find /persist/services-standby -maxdepth 1 -name 'services@*' -mmin +240 -exec btrfs subvolume delete {} \; || true
-        else
-          echo "Only $count snapshots found, keeping all (minimum: $MIN_KEEP)"
-        fi
+        # Push metrics to Prometheus pushgateway (best-effort); instance label is read from /proc since the hostname binary may not be on this unit's PATH
+        cat <<METRICS | curl -fsS --max-time 10 --data-binary @- "http://pushgateway.service.consul:9091/metrics/job/nfs_standby_cleanup/instance/$(cat /proc/sys/kernel/hostname)" || echo "WARNING: failed to push standby cleanup metrics to pushgateway" >&2
+        # TYPE nfs_standby_snapshot_count gauge
+        nfs_standby_snapshot_count $SNAPSHOT_COUNT
+        # TYPE nfs_standby_cleanup_last_run_timestamp gauge
+        nfs_standby_cleanup_last_run_timestamp $CLEANUP_TIME
+        METRICS
       '';
       serviceConfig = {
         Type = "oneshot";