NFS server and client setup.

2025-10-22 13:06:21 +01:00
parent 1262e03e21
commit 967ff34a51
9 changed files with 739 additions and 2 deletions
--- a/common/cluster-node.nix
+++ b/common/cluster-node.nix
@@ -1,13 +1,14 @@
 { pkgs, ... }:
 {
  # Cluster node configuration
-  # Extends minimal-node with cluster-specific services (Consul, GlusterFS, CIFS)
+  # Extends minimal-node with cluster-specific services (Consul, GlusterFS, CIFS, NFS)
  # Used by: compute nodes (c1, c2, c3)
  imports = [
    ./minimal-node.nix
    ./unattended-encryption.nix
    ./cifs-client.nix
    ./consul.nix
-    ./glusterfs-client.nix
+    ./glusterfs-client.nix  # Keep during migration, will be removed in Phase 3
    ./nfs-services-client.nix  # NFS client mount for /data/services (server located via Consul DNS)
  ];
 }
--- a/common/nfs-services-client.nix
+++ b/common/nfs-services-client.nix
@@ -0,0 +1,21 @@
 { pkgs, ... }:
 let
  # Remote export, addressed through Consul DNS: the NFS server registers the
  # "data-services" service, so the name resolves to whichever host currently
  # runs the NFS server.
  servicesExport = "data-services.service.consul:/persist/services";
  # Mount on demand rather than at boot, and release the mount when unused.
  servicesMountOptions = [
    "x-systemd.automount"        # mount on first access
    "noauto"                     # skip at boot; the automount unit takes over
    "x-systemd.idle-timeout=60"  # unmount after 60 seconds without activity
    "_netdev"                    # network filesystem, order after the network
  ];
 in
 {
  # NFS client side of /data/services.
  fileSystems = {
    "/data/services" = {
      fsType = "nfs";
      device = servicesExport;
      options = servicesMountOptions;
    };
  };
  # Make the NFS client tooling available on the node.
  environment.systemPackages = with pkgs; [ nfs-utils ];
 }
--- a/common/nfs-services-server.nix
+++ b/common/nfs-services-server.nix
@@ -0,0 +1,171 @@
 { config, lib, pkgs, ... }:
 # NFS services server (primary storage node).
 #
 # When enabled this module:
 #   - bind-mounts /persist/services at /data/services and exports it over NFS
 #   - registers the "data-services" service in Consul
 #   - refuses to start nfs-server while Consul reports another host as active
 #   - snapshots /persist/services every 5 minutes and sends the snapshot to
 #     every host listed in nfsServicesServer.standbys via btrfs send/receive
 let
  cfg = config.nfsServicesServer;
  # Local snapshots to retain: 24 hours at one snapshot every 5 minutes.
  keepSnapshots = 288;
 in
 {
  options.nfsServicesServer = {
    enable = lib.mkEnableOption "NFS services server" // { default = true; };
    standbys = lib.mkOption {
      type = lib.types.listOf lib.types.str;
      default = [];
      description = ''
        List of standby hostnames to replicate to (e.g. ["c1"]).
        Requires one-time setup on the NFS server:
          sudo mkdir -p /persist/root/.ssh
          sudo ssh-keygen -t ed25519 -f /persist/root/.ssh/btrfs-replication -N "" -C "root@$(hostname)-replication"
        Then add the public key to each standby's nfsServicesStandby.replicationKeys option.
      '';
    };
  };
  config = lib.mkIf cfg.enable {
    # Persist root SSH directory for replication key
    environment.persistence."/persist" = {
      directories = [
        "/root/.ssh"
      ];
    };
    # Bind mount /persist/services to /data/services for local access
    # This makes the path consistent with NFS clients
    # Use mkForce to override the NFS client mount from cluster-node.nix
    fileSystems."/data/services" = lib.mkForce {
      device = "/persist/services";
      fsType = "none";
      options = [ "bind" ];
    };
    # Nomad node metadata: mark this as the primary storage node
    # Jobs can constrain to ${meta.storage_role} = "primary"
    services.nomad.settings.client.meta = {
      storage_role = "primary";
    };
    # NFS server configuration
    services.nfs.server = {
      enable = true;
      exports = ''
        /persist/services 192.168.1.0/24(rw,sync,no_subtree_check,no_root_squash)
      '';
    };
    # Consul service registration for NFS
    services.consul.extraConfig.services = [{
      name = "data-services";
      port = 2049;
      checks = [{
        tcp = "localhost:2049";
        interval = "30s";
      }];
    }];
    # Firewall for NFS
    networking.firewall.allowedTCPPorts = [ 2049 111 20048 ];
    networking.firewall.allowedUDPPorts = [ 2049 111 20048 ];
    # systemd services: NFS server split-brain check + replication services
    systemd.services = lib.mkMerge ([
      # Safety check: prevent split-brain by ensuring no other NFS server is active
      {
        nfs-server = {
          preStart = ''
            # Wait (up to 30s) for the local Consul DNS endpoint
            for i in {1..30}; do
              if ${pkgs.netcat}/bin/nc -z localhost 8600; then
                break
              fi
              echo "Waiting for Consul DNS... ($i/30)"
              sleep 1
            done
            # Every address Consul currently returns for the service. Only lines that
            # look like IPv4 addresses are kept, so dig's ";; ..." diagnostics are not
            # mistaken for a server. If Consul DNS is unreachable the list is empty and
            # the check passes (fail-open).
            ACTIVE_SERVERS=$(${pkgs.dnsutils}/bin/dig +short @localhost -p 8600 data-services.service.consul | ${pkgs.gnugrep}/bin/grep -E '^[0-9]+(\.[0-9]+){3}$' || true)
            # Every non-loopback IPv4 address of this host. Consul may advertise any of
            # them, so the comparison below uses the whole list, not just the first.
            LOCAL_IPS=$(${pkgs.iproute2}/bin/ip -4 addr show | ${pkgs.gnugrep}/bin/grep -oP '(?<=inet\s)\d+(\.\d+){3}' | ${pkgs.gnugrep}/bin/grep -v '^127\.' || true)
            MY_IP=$(echo "$LOCAL_IPS" | head -1)
            # Refuse to start if any registered server address is not one of ours
            for CURRENT_SERVER in $ACTIVE_SERVERS; do
              if ! echo "$LOCAL_IPS" | ${pkgs.gnugrep}/bin/grep -Fxq "$CURRENT_SERVER"; then
                echo "ERROR: Another NFS server is already active at $CURRENT_SERVER"
                echo "This host ($MY_IP) is configured as NFS server but should be standby."
                echo "To fix:"
                echo "  1. If this is intentional (failback), first demote the other server"
                echo "  2. Update this host's config to use nfs-services-standby.nix instead"
                echo "  3. Sync data from active server before promoting this host"
                exit 1
              fi
            done
            echo "NFS server startup check passed (no other active server found)"
          '';
        };
      }
    ] ++ (lib.forEach cfg.standbys (standby: {
        "replicate-services-to-${standby}" = {
          description = "Replicate /persist/services to ${standby}";
          path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep ];
          script = ''
            set -euo pipefail
            SSH_KEY="/persist/root/.ssh/btrfs-replication"
            if [ ! -f "$SSH_KEY" ]; then
              echo "ERROR: SSH key not found at $SSH_KEY"
              echo "Run: sudo ssh-keygen -t ed25519 -f $SSH_KEY -N \"\" -C \"root@${config.networking.hostName}-replication\""
              exit 1
            fi

            # Feed a btrfs send stream (stdin) to btrfs receive on the standby.
            # The standby's authorized_keys entry forces this exact command, so it is
            # the only remote operation this key can perform.
            receive_on_standby() {
              ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
                "btrfs receive /persist/services-standby"
            }

            # Newest existing timestamped snapshot = candidate parent for an incremental
            # send. Snapshots are ordered by name (services@YYYYmmdd-HHMMSS sorts
            # chronologically); the directory mtime of a snapshot is inherited from the
            # source subvolume and says nothing about when the snapshot was taken.
            PREV_LOCAL=$(find /persist -maxdepth 1 -name 'services@[0-9]*' | sort | tail -n 1)

            SNAPSHOT_NAME="services@$(date +%Y%m%d-%H%M%S)"
            SNAPSHOT_PATH="/persist/$SNAPSHOT_NAME"
            # Create readonly snapshot
            btrfs subvolume snapshot -r /persist/services "$SNAPSHOT_PATH"

            # If this run fails, remove its snapshot again so the next run does not
            # pick an unsent snapshot as incremental parent.
            SENT=0
            cleanup() {
              if [ "$SENT" -ne 1 ]; then
                btrfs subvolume delete "$SNAPSHOT_PATH" || true
              fi
            }
            trap cleanup EXIT

            # Try an incremental send first; btrfs receive rejects it when the standby
            # does not have the parent snapshot.
            if [ -n "$PREV_LOCAL" ]; then
              echo "Incremental send from $(basename "$PREV_LOCAL") to ${standby}"
              if btrfs send -p "$PREV_LOCAL" "$SNAPSHOT_PATH" | receive_on_standby; then
                SENT=1
              else
                echo "Incremental send to ${standby} failed, falling back to full send"
              fi
            fi
            if [ "$SENT" -ne 1 ]; then
              # Receiver doesn't have parent (new standby or missing snapshot), do full send
              echo "Full send to ${standby} (new standby or parent snapshot not found on receiver)"
              btrfs send "$SNAPSHOT_PATH" | receive_on_standby
              SENT=1
            fi

            # Cleanup old snapshots on sender: keep the newest ${toString keepSnapshots}
            # (24 hours at 5min intervals). The snapshot just sent is always the newest
            # and is therefore never removed.
            find /persist -maxdepth 1 -name 'services@[0-9]*' | sort | head -n -${toString keepSnapshots} | while read -r OLD; do
              btrfs subvolume delete "$OLD"
            done
          '';
          serviceConfig = {
            Type = "oneshot";
            User = "root";
          };
        };
      }))
    );
    # One timer per standby, firing the matching replication service
    systemd.timers = lib.mkMerge (
      lib.forEach cfg.standbys (standby: {
        "replicate-services-to-${standby}" = {
          description = "Timer for replicating /persist/services to ${standby}";
          wantedBy = [ "timers.target" ];
          timerConfig = {
            OnCalendar = "*:0/5";  # Every 5 minutes
            Persistent = true;
          };
        };
      })
    );
  };
 }
--- a/common/nfs-services-standby.nix
+++ b/common/nfs-services-standby.nix
@@ -0,0 +1,68 @@
 { config, lib, pkgs, ... }:
 # NFS services standby.
 #
 # Accepts btrfs snapshot streams from the NFS server into
 # /persist/services-standby and prunes old received snapshots.
 let
  cfg = config.nfsServicesStandby;
  # Forced command for replication keys. An absolute store path is used so the
  # command does not depend on the PATH of the SSH session.
  receiveCommand = "${pkgs.btrfs-progs}/bin/btrfs receive /persist/services-standby";
  # Received snapshots to retain: 48 hours at one snapshot every 5 minutes.
  keepSnapshots = 576;
 in
 {
  options.nfsServicesStandby = {
    enable = lib.mkEnableOption "NFS services standby" // { default = true; };
    replicationKeys = lib.mkOption {
      type = lib.types.listOf lib.types.str;
      default = [];
      description = ''
        SSH public keys authorized to replicate btrfs snapshots to this standby.
        These keys are restricted to only run 'btrfs receive /persist/services-standby'.
        Get the public key from the NFS server:
          ssh <nfs-server> sudo cat /persist/root/.ssh/btrfs-replication.pub
      '';
    };
  };
  config = lib.mkIf cfg.enable {
    # Allow root SSH login for replication (restricted by command= in authorized_keys)
    # This is configured in common/sshd.nix
    # Restricted SSH keys for btrfs replication: whatever command the client
    # requests, sshd runs the forced receive command instead.
    users.users.root.openssh.authorizedKeys.keys =
      map (key: ''command="${receiveCommand}",restrict ${key}'') cfg.replicationKeys;
    # Mount point for services-standby subvolume
    # This is just declarative documentation - the subvolume must be created manually once:
    #   sudo btrfs subvolume create /persist/services-standby
    # After that, it will persist across reboots (it's under /persist)
    fileSystems."/persist/services-standby" = {
      device = "/persist/services-standby";
      fsType = "none";
      options = [ "bind" ];
      noCheck = true;
    };
    # Cleanup old snapshots on standby (keep last 48 hours for safety)
    systemd.services.cleanup-services-standby-snapshots = {
      description = "Cleanup old btrfs snapshots in services-standby";
      path = [ pkgs.btrfs-progs pkgs.findutils pkgs.coreutils ];
      script = ''
        set -euo pipefail
        # Keep the newest ${toString keepSnapshots} snapshots (48 hours at 5min intervals).
        # Snapshots are ordered by name (services@YYYYmmdd-HHMMSS sorts
        # chronologically). Directory mtime is not used: a received snapshot carries
        # the mtime of the source directory, not the time it was received, so an
        # age test on mtime could remove the newest snapshot.
        find /persist/services-standby -maxdepth 1 -name 'services@[0-9]*' | sort | head -n -${toString keepSnapshots} | while read -r OLD; do
          # Best effort: report a failed deletion and continue with the rest
          btrfs subvolume delete "$OLD" || echo "WARNING: failed to delete $OLD"
        done
      '';
      serviceConfig = {
        Type = "oneshot";
        User = "root";
      };
    };
    systemd.timers.cleanup-services-standby-snapshots = {
      description = "Timer for cleaning up old snapshots on standby";
      wantedBy = [ "timers.target" ];
      timerConfig = {
        OnCalendar = "daily";
        Persistent = true;
      };
    };
  };
 }
--- a/common/sshd.nix
+++ b/common/sshd.nix
@@ -5,6 +5,7 @@
    settings = {
      PasswordAuthentication = false;
      KbdInteractiveAuthentication = false;
      PermitRootLogin = "prohibit-password";  # Allow root login with SSH keys only
    };
  };
--- a/docs/NFS_FAILOVER.md
+++ b/docs/NFS_FAILOVER.md
@@ -0,0 +1,438 @@
 # NFS Services Failover Procedures
 This document describes how to fail over the `/data/services` NFS server between hosts and how to fail back.
 ## Architecture Overview
 - **Primary NFS Server**: Typically `zippy`
  - Exports `/persist/services` via NFS
  - Has local bind mount: `/data/services` → `/persist/services` (same path as clients)
  - Registers `data-services.service.consul` in Consul
  - Sets Nomad node meta: `storage_role = "primary"`
  - Replicates snapshots to standbys every 5 minutes via btrfs send
  - **Safety check**: Refuses to start if another NFS server is already active in Consul
 - **Standby**: Typically `c1`
  - Receives snapshots at `/persist/services-standby/services@<timestamp>`
  - Can be promoted to NFS server during failover
  - No special Nomad node meta (not primary)
 - **Clients**: All cluster nodes (c1, c2, c3, zippy)
  - Mount `/data/services` from `data-services.service.consul:/persist/services`
  - Automatically connect to whoever is registered in Consul
 ### Nomad Job Constraints
 Jobs that need to run on the primary storage node should use:
 ```hcl
 constraint {
  attribute = "${meta.storage_role}"
  value     = "primary"
 }
 ```
 This is useful for:
 - Database jobs (mysql, postgres, redis) that benefit from local storage
 - Jobs that need guaranteed fast disk I/O
 During failover, the `storage_role = "primary"` meta attribute moves to the new NFS server, and Nomad automatically reschedules constrained jobs to the new primary.
 ## Prerequisites
 - Standby has been receiving snapshots (check: `ls -d /persist/services-standby/services@*`)
 - Last successful replication was recent (< 5-10 minutes)
 ---
 ## Failover: Promoting Standby to Primary
 **Scenario**: `zippy` is down and you need to promote `c1` to be the NFS server.
 ### Step 1: Choose Latest Snapshot
 On the standby (c1):
 ```bash
 ssh c1
 sudo ls -d /persist/services-standby/services@* | sort -r | head -5
 ```
 Find the most recent snapshot (the names embed the creation time, so the first line is the newest). Note the timestamp to estimate data loss (typically < 5 minutes).
 ### Step 2: Promote Snapshot to Read-Write Subvolume
 On c1:
 ```bash
 # Find the latest snapshot (sort by name; -d lists the snapshot itself, not its contents)
 LATEST=$(sudo ls -d /persist/services-standby/services@* | sort | tail -1)
 # Create writable subvolume from snapshot
 sudo btrfs subvolume snapshot "$LATEST" /persist/services
 # Verify
 ls -la /persist/services
 ```
 ### Step 3: Update NixOS Configuration
 Edit your configuration to swap the NFS server role:
 **In `hosts/c1/default.nix`**:
 ```nix
 imports = [
  # ... existing imports ...
  # ../../common/nfs-services-standby.nix  # REMOVE THIS
  ../../common/nfs-services-server.nix     # ADD THIS
 ];
 # Add standbys if desired (optional - can leave empty during emergency)
 nfsServicesServer.standbys = [];  # Or ["c2"] to add a new standby
 ```
 **Optional: Prepare zippy config for when it comes back**:
 In `hosts/zippy/default.nix` (can do this later too):
 ```nix
 imports = [
  # ... existing imports ...
  # ../../common/nfs-services-server.nix   # REMOVE THIS
  ../../common/nfs-services-standby.nix    # ADD THIS
 ];
 # Add the replication key from c1 (get it from c1:/persist/root/.ssh/btrfs-replication.pub)
 nfsServicesStandby.replicationKeys = [
  "ssh-ed25519 AAAA... root@c1-replication"
 ];
 ```
 ### Step 4: Deploy Configuration
 ```bash
 # From your workstation
 deploy -s '.#c1'
 # If zippy is still down, updating its config will fail, but that's okay
 # You can update it later when it comes back
 ```
 ### Step 5: Verify NFS Server is Running
 On c1:
 ```bash
 sudo systemctl status nfs-server
 sudo showmount -e localhost
 dig @localhost -p 8600 data-services.service.consul  # Should show c1's IP
 ```
 ### Step 6: Verify Clients Can Access
 From any node:
 ```bash
 df -h | grep services
 ls /data/services
 ```
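 New mounts resolve the server through Consul DNS, so they land on the new primary. A client that still had the old server mounted may hang or report a stale file handle; if so, remount it as described under "Mount is Stale" below.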
 ### Step 7: Check Nomad Jobs
 ```bash
 nomad job status mysql
 nomad job status postgres
 # Verify critical services are healthy
 # Jobs constrained to ${meta.storage_role} = "primary" will automatically
 # reschedule to c1 once it's deployed with the NFS server module
 ```
 **Recovery Time Objective (RTO)**: ~10-15 minutes
 **Recovery Point Objective (RPO)**: Last replication interval (5 minutes max)
 **Note**: Jobs with the `storage_role = "primary"` constraint will automatically move to c1 because it now has that node meta attribute. No job spec changes needed!
 ---
 ## What Happens When zippy Comes Back?
 **IMPORTANT**: If zippy reboots while still configured as NFS server, it will **refuse to start** the NFS service because it detects c1 is already active in Consul.
 You'll see this error in `journalctl -u nfs-server`:
 ```
 ERROR: Another NFS server is already active at 192.168.1.X
 This host (192.168.1.2) is configured as NFS server but should be standby.
 To fix:
  1. If this is intentional (failback), first demote the other server
  2. Update this host's config to use nfs-services-standby.nix instead
  3. Sync data from active server before promoting this host
 ```
 This is a **safety feature** to prevent split-brain and data corruption.
 ### Options when zippy comes back:
 **Option A: Keep c1 as primary** (zippy becomes standby)
 1. Update zippy's config to use `nfs-services-standby.nix`
 2. Deploy to zippy
 3. c1 will start replicating to zippy
 **Option B: Fail back to zippy as primary**
 Follow the "Failing Back to Original Primary" procedure below.
 ---
 ## Failing Back to Original Primary
 **Scenario**: `zippy` is repaired and you want to move the NFS server role back from `c1` to `zippy`.
 ### Step 1: Sync Latest Data from c1 to zippy
 On c1 (current primary):
 ```bash
 # Create readonly snapshot of current state
 sudo btrfs subvolume snapshot -r /persist/services /persist/services@failback-$(date +%Y%m%d-%H%M%S)
 # Find the snapshot (-d lists the snapshot itself, not its contents)
 FAILBACK=$(sudo ls -d /persist/services@failback-* | sort | tail -1)
 # Send to zippy (use root SSH key if available, or generate temporary key)
 sudo btrfs send "$FAILBACK" | ssh root@zippy "btrfs receive /persist/"
 ```
 On zippy:
 ```bash
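 # Verify snapshot arrived
 ls -ld /persist/services@failback-*
 # If the pre-failover /persist/services still exists, move it out of the way first;
 # otherwise the snapshot below would be created *inside* it instead of replacing it
 sudo mv /persist/services /persist/services.old-$(date +%Y%m%d-%H%M%S)
 # Create writable subvolume from the snapshot
 FAILBACK=$(ls -d /persist/services@failback-* | sort | tail -1)
 sudo btrfs subvolume snapshot "$FAILBACK" /persist/services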
 # Verify
 ls -la /persist/services
 ```
 ### Step 2: Update NixOS Configuration
 Swap the roles back:
 **In `hosts/zippy/default.nix`**:
 ```nix
 imports = [
  # ... existing imports ...
  # ../../common/nfs-services-standby.nix  # REMOVE THIS
  ../../common/nfs-services-server.nix     # ADD THIS
 ];
 nfsServicesServer.standbys = ["c1"];
 ```
 **In `hosts/c1/default.nix`**:
 ```nix
 imports = [
  # ... existing imports ...
  # ../../common/nfs-services-server.nix   # REMOVE THIS
  ../../common/nfs-services-standby.nix    # ADD THIS
 ];
 nfsServicesStandby.replicationKeys = [
  "ssh-ed25519 AAAA... root@zippy-replication"  # Get from zippy:/persist/root/.ssh/btrfs-replication.pub
 ];
 ```
 ### Step 3: Deploy Configurations
 ```bash
 # IMPORTANT: Deploy c1 FIRST to demote it
 deploy -s '.#c1'
 # Wait for c1 to stop NFS server
 ssh c1 sudo systemctl status nfs-server  # Should be inactive
 # Then deploy zippy to promote it
 deploy -s '.#zippy'
 ```
 The order matters! If you deploy zippy first, it will see c1 is still active and refuse to start.
 ### Step 4: Verify Failback
 Check Consul DNS points to zippy:
 ```bash
 dig @c1 -p 8600 data-services.service.consul  # Should show zippy's IP
 ```
 Check clients are mounting from zippy:
 ```bash
 for host in c1 c2 c3; do
  ssh $host "df -h | grep services"
 done
 ```
 ### Step 5: Clean Up Temporary Snapshots
 On c1:
 ```bash
 # Remove the failback snapshot and the promoted subvolume
 sudo btrfs subvolume delete /persist/services@failback-*
 sudo btrfs subvolume delete /persist/services
 ```
 ---
 ## Adding a New Standby
 **Scenario**: You want to add `c2` as an additional standby.
 ### Step 1: Create Standby Subvolume on c2
 ```bash
 ssh c2
 sudo btrfs subvolume create /persist/services-standby
 ```
 ### Step 2: Update c2 Configuration
 **In `hosts/c2/default.nix`**:
 ```nix
 imports = [
  # ... existing imports ...
  ../../common/nfs-services-standby.nix
 ];
 nfsServicesStandby.replicationKeys = [
  "ssh-ed25519 AAAA... root@zippy-replication"  # Get from current NFS server
 ];
 ```
 ### Step 3: Update NFS Server Configuration
 On the current NFS server (e.g., zippy), update the standbys list:
 **In `hosts/zippy/default.nix`**:
 ```nix
 nfsServicesServer.standbys = ["c1" "c2"];  # Added c2
 ```
 ### Step 4: Deploy
 ```bash
 deploy -s '.#c2'
 deploy -s '.#zippy'
 ```
 The next replication cycle (within 5 minutes) will do a full send to c2, then switch to incremental.
 ---
 ## Troubleshooting
 ### Replication Failed
 Check the replication service logs:
 ```bash
 # On NFS server
 sudo journalctl -u replicate-services-to-c1 -f
 ```
 Common issues:
 - SSH key not found → Run key generation step (see stateful-commands.txt)
 - Permission denied → Check authorized_keys on standby
 - Snapshot already exists → Old snapshot with same timestamp, wait for next cycle
 ### Clients Can't Mount
 Check Consul:
 ```bash
 dig @localhost -p 8600 data-services.service.consul
 consul catalog services | grep data-services
 ```
 If Consul isn't resolving:
 - NFS server might not have registered → Check `sudo systemctl status nfs-server`
 - Consul agent might be down → Check `sudo systemctl status consul`
 ### Mount is Stale
 Force remount:
 ```bash
 sudo systemctl restart data-services.mount
 ```
 Or unmount and let automount handle it:
 ```bash
 sudo umount /data/services
 ls /data/services  # Triggers automount
 ```
 ### Split-Brain Prevention: NFS Server Won't Start
 If you see:
 ```
 ERROR: Another NFS server is already active at 192.168.1.X
 ```
 This is **intentional** - the safety check is working! You have two options:
 1. **Keep the other server as primary**: Update this host's config to be a standby instead
 2. **Fail back to this host**: First demote the other server, sync data, then deploy both hosts in correct order
 ---
 ## Monitoring
 ### Check Replication Status
 On NFS server:
 ```bash
 # List recent snapshots
 ls -d /persist/services@* | sort -r | head
 # Check last replication run
 sudo systemctl status replicate-services-to-c1
 # Check replication logs
 sudo journalctl -u replicate-services-to-c1 --since "1 hour ago"
 ```
 On standby:
 ```bash
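 # List received snapshots (newest first; the name carries the creation timestamp)
 ls -d /persist/services-standby/services@* | sort -r | head
 # Check how old the latest snapshot is (read the timestamp from its name;
 # the directory mtime does not reflect when the snapshot was taken)
 ls -d /persist/services-standby/services@* | sort | tail -1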
 ```
 ### Verify NFS Exports
 ```bash
 sudo showmount -e localhost
 ```
 Should show:
 ```
 /persist/services 192.168.1.0/24
 ```
 ### Check Consul Registration
 ```bash
 consul catalog services | grep data-services
 dig @localhost -p 8600 data-services.service.consul
 ```
--- a/hosts/c1/default.nix
+++ b/hosts/c1/default.nix
@@ -4,6 +4,11 @@
    ../../common/encrypted-btrfs-layout.nix
    ../../common/global
    ../../common/compute-node.nix
    ../../common/nfs-services-standby.nix  # NFS standby for /data/services
    # To promote to NFS server (during failover):
    # 1. Follow procedure in docs/NFS_FAILOVER.md
    # 2. Replace above line with: ../../common/nfs-services-server.nix
    # 3. Add nfsServicesServer.standbys = [ "c2" ]; (or leave empty)
    ./hardware.nix
  ];
@@ -15,4 +20,9 @@
  networking.hostName = "c1";
  services.tailscaleAutoconnect.authkey = "tskey-auth-k2nQ771YHM11CNTRL-YVpoumL2mgR6nLPG51vNhRpEKMDN7gLAi";
  # NFS standby configuration: accept replication from zippy
  nfsServicesStandby.replicationKeys = [
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHyTKsMCbwCIlMcC/aopgz5Yfx/Q9QdlWC9jzMLgYFAV root@zippy-replication"
  ];
 }
--- a/hosts/zippy/default.nix
+++ b/hosts/zippy/default.nix
@@ -5,6 +5,11 @@
    ../../common/global
    ../../common/compute-node.nix
 #    ../../common/ethereum.nix
    ../../common/nfs-services-server.nix  # NFS server for /data/services
    # To move NFS server role to another host:
    # 1. Follow procedure in docs/NFS_FAILOVER.md
    # 2. Replace above line with: ../../common/nfs-services-standby.nix
    # 3. Add nfsServicesStandby.replicationKeys with the new server's public key
    ./hardware.nix
  ];
@@ -16,4 +21,7 @@
  networking.hostName = "zippy";
  services.tailscaleAutoconnect.authkey = "tskey-auth-ktKyQ59f2p11CNTRL-ut8E71dLWPXsVtb92hevNX9RTjmk4owBf";
  # NFS server configuration: replicate to c1 as standby
  nfsServicesServer.standbys = [ "c1" ];
 }
--- a/stateful-commands.txt
+++ b/stateful-commands.txt
@@ -39,3 +39,22 @@ kopia repository server setup (on a non-NixOS host at the time):
  * kopia server start --address 0.0.0.0:51515 --tls-cert-file ~/kopia-certs/kopia.cert --tls-key-file ~/kopia-certs/kopia.key --tls-generate-cert (first time)
  * kopia server start --address 0.0.0.0:51515 --tls-cert-file ~/kopia-certs/kopia.cert --tls-key-file ~/kopia-certs/kopia.key (subsequent)
 [TLS is mandatory for this]
 NFS services server setup (one-time on the NFS server host, e.g. zippy):
  * sudo btrfs subvolume create /persist/services
  * sudo mkdir -p /persist/root/.ssh
  * sudo ssh-keygen -t ed25519 -f /persist/root/.ssh/btrfs-replication -N "" -C "root@$(hostname)-replication"
  * Get the public key: sudo cat /persist/root/.ssh/btrfs-replication.pub
    Then add this public key to each standby's nfsServicesStandby.replicationKeys option
 NFS services standby setup (one-time on each standby host, e.g. c1):
  * sudo btrfs subvolume create /persist/services-standby
 Moving NFS server role between hosts (e.g. from zippy to c1):
  See docs/NFS_FAILOVER.md for detailed procedure
  Summary:
  1. On current primary: create final snapshot and send to new primary
  2. On new primary: promote snapshot to /persist/services
  3. Update configs: remove nfs-services-server.nix from old primary, add to new primary
  4. Update configs: add nfs-services-standby.nix to old primary (with replication keys)
  5. Deploy old primary first (to demote), then new primary (to promote)