{ config, lib, pkgs, ... }:

let
  cfg = config.nfsServicesServer;
in
{
  options.nfsServicesServer = {
    enable = lib.mkEnableOption "NFS services server" // { default = true; };

    standbys = lib.mkOption {
      type = lib.types.listOf lib.types.str;
      default = [ ];
      description = ''
        List of standby hostnames to replicate to (e.g. ["c1"]).

        Requires one-time setup on the NFS server:
          sudo mkdir -p /persist/root/.ssh
          sudo ssh-keygen -t ed25519 -f /persist/root/.ssh/btrfs-replication -N "" -C "root@$(hostname)-replication"
        Then add the public key to each standby's nfsServicesStandby.replicationKeys option.
      '';
    };
  };

  config = lib.mkIf cfg.enable {
    # Persist the root SSH directory so the replication key survives reboots
    environment.persistence.${config.custom.impermanence.persistPath} = {
      directories = [ "/root/.ssh" ];
    };

    # Bind-mount /persist/services to /data/services for local access.
    # This makes the path consistent with NFS clients.
    # mkForce overrides the NFS client mount from cluster-node.nix.
    fileSystems."/data/services" = lib.mkForce {
      device = "/persist/services";
      fsType = "none";
      options = [ "bind" ];
    };

    # Nomad node metadata: mark this as the primary storage node.
    # Jobs can constrain on ${meta.storage_role} = "primary".
    services.nomad.settings.client.meta = {
      storage_role = "primary";
    };

    # NFS server configuration
    services.nfs.server = {
      enable = true;
      exports = ''
        /persist/services 192.168.1.0/24(rw,sync,no_subtree_check,no_root_squash)
      '';
    };

    # Consul service registration for NFS
    services.consul.extraConfig.services = [{
      name = "data-services";
      port = 2049;
      checks = [{
        tcp = "localhost:2049";
        interval = "30s";
      }];
    }];

    # Firewall for NFS
    networking.firewall.allowedTCPPorts = [ 2049 111 20048 ];
    networking.firewall.allowedUDPPorts = [ 2049 111 20048 ];

    # systemd services: NFS server split-brain check + replication services
    systemd.services = lib.mkMerge ([
      # Safety check: prevent split-brain by ensuring no other NFS server is active
      {
        nfs-server = {
          preStart = ''
            # Wait for Consul DNS to become available
            for i in {1..30}; do
              if ${pkgs.netcat}/bin/nc -z localhost 8600; then
                break
              fi
              echo "Waiting for Consul DNS... ($i/30)"
              sleep 1
            done

            # Check whether another NFS server is already registered in Consul
            CURRENT_SERVER=$(${pkgs.dnsutils}/bin/dig +short @localhost -p 8600 data-services.service.consul | head -1 || true)
            MY_IP=$(${pkgs.iproute2}/bin/ip -4 addr show | ${pkgs.gnugrep}/bin/grep -oP '(?<=inet\s)\d+(\.\d+){3}' | ${pkgs.gnugrep}/bin/grep -v '^127\.' | head -1)

            if [ -n "$CURRENT_SERVER" ] && [ "$CURRENT_SERVER" != "$MY_IP" ]; then
              echo "ERROR: Another NFS server is already active at $CURRENT_SERVER"
              echo "This host ($MY_IP) is configured as NFS server but should be standby."
              echo "To fix:"
              echo "  1. If this is intentional (failback), first demote the other server"
              echo "  2. Update this host's config to use nfs-services-standby.nix instead"
              echo "  3. Sync data from the active server before promoting this host"
              exit 1
            fi
            echo "NFS server startup check passed (no other active server found)"
          '';
        };
      }
    ] ++ (lib.forEach cfg.standbys (standby: {
      "replicate-services-to-${standby}" = {
        description = "Replicate /persist/services to ${standby}";
        path = [
          pkgs.btrfs-progs
          pkgs.openssh
          pkgs.coreutils
          pkgs.findutils
          pkgs.gnugrep
          pkgs.curl
        ];
        script = ''
          set -euo pipefail

          START_TIME=$(date +%s)
          REPLICATION_SUCCESS=0

          SSH_KEY="/persist/root/.ssh/btrfs-replication"
-f "$SSH_KEY" ]; then echo "ERROR: SSH key not found at $SSH_KEY" echo "Run: sudo ssh-keygen -t ed25519 -f $SSH_KEY -N \"\" -C \"root@$(hostname)-replication\"" exit 1 fi SNAPSHOT_NAME="services@$(date +%Y%m%d-%H%M%S)" SNAPSHOT_PATH="/persist/$SNAPSHOT_NAME" # Create readonly snapshot btrfs subvolume snapshot -r /persist/services "$SNAPSHOT_PATH" # Find previous snapshot on sender (sort by name since readonly snapshots have same mtime) # Use -d to list directories only, not their contents PREV_LOCAL=$(ls -1d /persist/services@* 2>/dev/null | grep -v "^$SNAPSHOT_PATH$" | sort -r | head -1 || true) # Try incremental send if we have a parent, fall back to full send if it fails if [ -n "$PREV_LOCAL" ]; then echo "Attempting incremental send from $(basename $PREV_LOCAL) to ${standby}" # Try incremental send, if it fails (e.g., parent missing on receiver), fall back to full if btrfs send -p "$PREV_LOCAL" "$SNAPSHOT_PATH" | \ ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ "btrfs receive /persist/services-standby"; then echo "Incremental send completed successfully" REPLICATION_SUCCESS=1 else echo "Incremental send failed (likely missing parent on receiver), falling back to full send with clone source" # Use -c to specify clone source, maintaining parent relationship even in full send btrfs send -c "$PREV_LOCAL" "$SNAPSHOT_PATH" | \ ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ "btrfs receive /persist/services-standby" REPLICATION_SUCCESS=1 fi else # First snapshot, do full send echo "Full send to ${standby} (first snapshot)" btrfs send "$SNAPSHOT_PATH" | \ ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ "btrfs receive /persist/services-standby" REPLICATION_SUCCESS=1 fi # Cleanup old snapshots on sender (keep last 10 snapshots, sorted by name/timestamp) ls -1d /persist/services@* 2>/dev/null | sort | head -n -10 | xargs -r btrfs subvolume delete # Calculate metrics END_TIME=$(date +%s) DURATION=$((END_TIME - START_TIME)) SNAPSHOT_COUNT=$(ls -1d /persist/services@* 2>/dev/null | wc -l) # Push metrics to Prometheus pushgateway cat <