Files
alo-cluster/common/nfs-services-standby.nix

80 lines
2.9 KiB
Nix

{ config, lib, pkgs, ... }:
let
cfg = config.nfsServicesStandby;
in
{
options.nfsServicesStandby = {
enable = lib.mkEnableOption "NFS services standby" // { default = true; };
replicationKeys = lib.mkOption {
type = lib.types.listOf lib.types.str;
default = [];
description = ''
SSH public keys authorized to replicate btrfs snapshots to this standby.
These keys are restricted to only run 'btrfs receive /persist/services-standby'.
Get the public key from the NFS server:
ssh <nfs-server> sudo cat /persist/root/.ssh/btrfs-replication.pub
'';
};
};
config = lib.mkIf cfg.enable {
# Allow root SSH login for replication (restricted by command= in authorized_keys)
# This is configured in common/sshd.nix
# Restricted SSH keys for btrfs replication
users.users.root.openssh.authorizedKeys.keys =
map (key: ''command="btrfs receive /persist/services-standby",restrict ${key}'') cfg.replicationKeys;
# Mount point for services-standby subvolume
# This is just declarative documentation - the subvolume must be created manually once:
# sudo btrfs subvolume create /persist/services-standby
# After that, it will persist across reboots (it's under /persist)
fileSystems."/persist/services-standby" = {
device = "/persist/services-standby";
fsType = "none";
options = [ "bind" ];
noCheck = true;
};
# Cleanup old snapshots on standby (keep last 10 snapshots)
systemd.services.cleanup-services-standby-snapshots = {
description = "Cleanup old btrfs snapshots in services-standby";
path = [ pkgs.btrfs-progs pkgs.findutils pkgs.coreutils pkgs.curl ];
script = ''
set -euo pipefail
# Cleanup old snapshots on standby (keep last 10 snapshots, sorted by name/timestamp)
ls -1d /persist/services-standby/services@* 2>/dev/null | sort | head -n -10 | xargs -r btrfs subvolume delete || true
# Calculate metrics
CLEANUP_TIME=$(date +%s)
SNAPSHOT_COUNT=$(ls -1d /persist/services-standby/services@* 2>/dev/null | wc -l)
# Push metrics to Prometheus pushgateway
cat <<METRICS | curl --data-binary @- http://pushgateway.service.consul:9091/metrics/job/nfs_standby_cleanup/instance/$(hostname)
# TYPE nfs_standby_snapshot_count gauge
nfs_standby_snapshot_count $SNAPSHOT_COUNT
# TYPE nfs_standby_cleanup_last_run_timestamp gauge
nfs_standby_cleanup_last_run_timestamp $CLEANUP_TIME
METRICS
'';
serviceConfig = {
Type = "oneshot";
User = "root";
};
};
systemd.timers.cleanup-services-standby-snapshots = {
description = "Timer for cleaning up old snapshots on standby";
wantedBy = [ "timers.target" ];
timerConfig = {
OnCalendar = "hourly";
Persistent = true;
};
};
};
}