NFS server and client setup.

2025-10-22 13:06:21 +01:00
parent 1262e03e21
commit 967ff34a51
9 changed files with 739 additions and 2 deletions


@@ -1,13 +1,14 @@
{ pkgs, ... }:
{
# Cluster node configuration
# Extends minimal-node with cluster-specific services (Consul, GlusterFS, CIFS, NFS)
# Used by: compute nodes (c1, c2, c3)
imports = [
./minimal-node.nix
./unattended-encryption.nix
./cifs-client.nix
./consul.nix
./glusterfs-client.nix # Keep during migration, will be removed in Phase 3
./nfs-services-client.nix # New: NFS client for /data/services
];
}


@@ -0,0 +1,21 @@
{ pkgs, ... }:
{
# NFS client for /data/services
# Mounts from data-services.service.consul (Consul DNS for automatic failover)
# The NFS server registers itself in Consul, so this will automatically
# point to whichever host is currently running the NFS server
fileSystems."/data/services" = {
device = "data-services.service.consul:/persist/services";
fsType = "nfs";
options = [
"x-systemd.automount" # Auto-mount on access
"noauto" # Don't mount at boot (automount handles it)
"x-systemd.idle-timeout=60" # Unmount after 60s of inactivity
"_netdev" # Network filesystem (wait for network)
];
};
# Ensure NFS client packages are available
environment.systemPackages = [ pkgs.nfs-utils ];
}


@@ -0,0 +1,171 @@
{ config, lib, pkgs, ... }:
let
cfg = config.nfsServicesServer;
in
{
options.nfsServicesServer = {
enable = lib.mkEnableOption "NFS services server" // { default = true; };
standbys = lib.mkOption {
type = lib.types.listOf lib.types.str;
default = [];
description = ''
List of standby hostnames to replicate to (e.g. ["c1"]).
Requires one-time setup on the NFS server:
sudo mkdir -p /persist/root/.ssh
sudo ssh-keygen -t ed25519 -f /persist/root/.ssh/btrfs-replication -N "" -C "root@$(hostname)-replication"
Then add the public key to each standby's nfsServicesStandby.replicationKeys option.
'';
};
};
config = lib.mkIf cfg.enable {
# Persist root SSH directory for replication key
environment.persistence."/persist" = {
directories = [
"/root/.ssh"
];
};
# Bind mount /persist/services to /data/services for local access
# This makes the path consistent with NFS clients
# Use mkForce to override the NFS client mount from cluster-node.nix
fileSystems."/data/services" = lib.mkForce {
device = "/persist/services";
fsType = "none";
options = [ "bind" ];
};
# Nomad node metadata: mark this as the primary storage node
# Jobs can constrain to ${meta.storage_role} = "primary"
services.nomad.settings.client.meta = {
storage_role = "primary";
};
# NFS server configuration
services.nfs.server = {
enable = true;
exports = ''
/persist/services 192.168.1.0/24(rw,sync,no_subtree_check,no_root_squash)
'';
};
# Consul service registration for NFS
services.consul.extraConfig.services = [{
name = "data-services";
port = 2049;
checks = [{
tcp = "localhost:2049";
interval = "30s";
}];
}];
# Firewall for NFS
networking.firewall.allowedTCPPorts = [ 2049 111 20048 ];
networking.firewall.allowedUDPPorts = [ 2049 111 20048 ];
# systemd services: NFS server split-brain check + replication services
systemd.services = lib.mkMerge ([
# Safety check: prevent split-brain by ensuring no other NFS server is active
{
nfs-server = {
preStart = ''
# Wait for Consul to be available
for i in {1..30}; do
if ${pkgs.netcat}/bin/nc -z localhost 8600; then
break
fi
echo "Waiting for Consul DNS... ($i/30)"
sleep 1
done
# Check if another NFS server is already registered in Consul
CURRENT_SERVER=$(${pkgs.dnsutils}/bin/dig +short @localhost -p 8600 data-services.service.consul | head -1 || true)
MY_IP=$(${pkgs.iproute2}/bin/ip -4 addr show | ${pkgs.gnugrep}/bin/grep -oP '(?<=inet\s)\d+(\.\d+){3}' | ${pkgs.gnugrep}/bin/grep -v '^127\.' | head -1)
if [ -n "$CURRENT_SERVER" ] && [ "$CURRENT_SERVER" != "$MY_IP" ]; then
echo "ERROR: Another NFS server is already active at $CURRENT_SERVER"
echo "This host ($MY_IP) is configured as NFS server but should be standby."
echo "To fix:"
echo " 1. If this is intentional (failback), first demote the other server"
echo " 2. Update this host's config to use nfs-services-standby.nix instead"
echo " 3. Sync data from active server before promoting this host"
exit 1
fi
echo "NFS server startup check passed (no other active server found)"
'';
};
}
] ++ (lib.forEach cfg.standbys (standby: {
"replicate-services-to-${standby}" = {
description = "Replicate /persist/services to ${standby}";
path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep ];
script = ''
set -euo pipefail
SSH_KEY="/persist/root/.ssh/btrfs-replication"
if [ ! -f "$SSH_KEY" ]; then
echo "ERROR: SSH key not found at $SSH_KEY"
echo "Run: sudo ssh-keygen -t ed25519 -f $SSH_KEY -N \"\" -C \"root@$(hostname)-replication\""
exit 1
fi
SNAPSHOT_NAME="services@$(date +%Y%m%d-%H%M%S)"
SNAPSHOT_PATH="/persist/$SNAPSHOT_NAME"
# Create readonly snapshot
btrfs subvolume snapshot -r /persist/services "$SNAPSHOT_PATH"
# Find previous snapshot on sender
PREV_LOCAL=$(ls -t /persist/services@* 2>/dev/null | grep -v "^$SNAPSHOT_PATH$" | head -1 || true)
# Check what snapshots exist on the receiver
REMOTE_SNAPSHOTS=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
"ls -t /persist/services-standby/services@* 2>/dev/null || true")
# Decide: incremental or full send
if [ -n "$PREV_LOCAL" ] && echo "$REMOTE_SNAPSHOTS" | grep -q "$(basename "$PREV_LOCAL")"; then
# Receiver has the parent snapshot, do incremental
echo "Incremental send from $(basename $PREV_LOCAL) to ${standby}"
btrfs send -p "$PREV_LOCAL" "$SNAPSHOT_PATH" | \
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
"btrfs receive /persist/services-standby"
else
# Receiver doesn't have parent (new standby or missing snapshot), do full send
echo "Full send to ${standby} (new standby or parent snapshot not found on receiver)"
btrfs send "$SNAPSHOT_PATH" | \
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
"btrfs receive /persist/services-standby"
fi
# Cleanup old snapshots on sender (keep last 24 hours = 288 snapshots at 5min intervals)
find /persist -maxdepth 1 -name 'services@*' -mmin +1440 -exec btrfs subvolume delete {} \;
'';
serviceConfig = {
Type = "oneshot";
User = "root";
};
};
}))
);
systemd.timers = lib.mkMerge (
lib.forEach cfg.standbys (standby: {
"replicate-services-to-${standby}" = {
description = "Timer for replicating /persist/services to ${standby}";
wantedBy = [ "timers.target" ];
timerConfig = {
OnCalendar = "*:0/5"; # Every 5 minutes
Persistent = true;
};
};
})
);
};
}


@@ -0,0 +1,68 @@
{ config, lib, pkgs, ... }:
let
cfg = config.nfsServicesStandby;
in
{
options.nfsServicesStandby = {
enable = lib.mkEnableOption "NFS services standby" // { default = true; };
replicationKeys = lib.mkOption {
type = lib.types.listOf lib.types.str;
default = [];
description = ''
SSH public keys authorized to replicate btrfs snapshots to this standby.
These keys are restricted to only run 'btrfs receive /persist/services-standby'.
Get the public key from the NFS server:
ssh <nfs-server> sudo cat /persist/root/.ssh/btrfs-replication.pub
'';
};
};
config = lib.mkIf cfg.enable {
# Allow root SSH login for replication (restricted by command= in authorized_keys)
# This is configured in common/sshd.nix
# Restricted SSH keys for btrfs replication
users.users.root.openssh.authorizedKeys.keys =
map (key: ''command="btrfs receive /persist/services-standby",restrict ${key}'') cfg.replicationKeys;
# Mount point for services-standby subvolume
# This is just declarative documentation - the subvolume must be created manually once:
# sudo btrfs subvolume create /persist/services-standby
# After that, it will persist across reboots (it's under /persist)
fileSystems."/persist/services-standby" = {
device = "/persist/services-standby";
fsType = "none";
options = [ "bind" ];
noCheck = true;
};
# Cleanup old snapshots on standby (keep last 48 hours for safety)
systemd.services.cleanup-services-standby-snapshots = {
description = "Cleanup old btrfs snapshots in services-standby";
path = [ pkgs.btrfs-progs pkgs.findutils ];
script = ''
set -euo pipefail
# Keep last 48 hours of snapshots (576 snapshots at 5min intervals)
find /persist/services-standby -maxdepth 1 -name 'services@*' -mmin +2880 -exec btrfs subvolume delete {} \; || true
'';
serviceConfig = {
Type = "oneshot";
User = "root";
};
};
systemd.timers.cleanup-services-standby-snapshots = {
description = "Timer for cleaning up old snapshots on standby";
wantedBy = [ "timers.target" ];
timerConfig = {
OnCalendar = "daily";
Persistent = true;
};
};
};
}


@@ -5,6 +5,7 @@
settings = {
PasswordAuthentication = false;
KbdInteractiveAuthentication = false;
PermitRootLogin = "prohibit-password"; # Allow root login with SSH keys only
};
};

docs/NFS_FAILOVER.md (new file)

@@ -0,0 +1,438 @@
# NFS Services Failover Procedures
This document describes how to fail over the `/data/services` NFS server between hosts and how to fail back.
## Architecture Overview
- **Primary NFS Server**: Typically `zippy`
- Exports `/persist/services` via NFS
  - Has local bind mount: `/data/services` → `/persist/services` (same path as clients)
- Registers `data-services.service.consul` in Consul
- Sets Nomad node meta: `storage_role = "primary"`
- Replicates snapshots to standbys every 5 minutes via btrfs send
- **Safety check**: Refuses to start if another NFS server is already active in Consul
- **Standby**: Typically `c1`
- Receives snapshots at `/persist/services-standby/services@<timestamp>`
- Can be promoted to NFS server during failover
- No special Nomad node meta (not primary)
- **Clients**: All cluster nodes (c1, c2, c3, zippy)
- Mount `/data/services` from `data-services.service.consul:/persist/services`
  - Automatically connect to whichever host is currently registered in Consul
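A quick way to see which host clients will currently mount from, as a sketch (run on any cluster node):
```bash
dig +short @localhost -p 8600 data-services.service.consul   # IP Consul currently hands out
findmnt /data/services                                        # what, if anything, is mounted right now
```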
### Nomad Job Constraints
Jobs that need to run on the primary storage node should use:
```hcl
constraint {
attribute = "${meta.storage_role}"
value = "primary"
}
```
This is useful for:
- Database jobs (mysql, postgres, redis) that benefit from local storage
- Jobs that need guaranteed fast disk I/O
During failover, the `storage_role = "primary"` meta attribute moves to the new NFS server, and Nomad automatically reschedules constrained jobs to the new primary.
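To see which node currently carries the meta attribute, a sketch using the standard Nomad CLI (substitute a real node ID from the first command):
```bash
nomad node status                                          # list nodes and their IDs
nomad node status -verbose <node-id> | grep storage_role   # should show storage_role = primary on the NFS server
```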
## Prerequisites
- Standby has been receiving snapshots (check: `ls /persist/services-standby/services@*`)
- Last successful replication was recent (< 5-10 minutes)
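A sketch for checking both prerequisites in one go on the standby:
```bash
ssh c1
sudo ls -t /persist/services-standby/services@* | head -3
# Age of the newest snapshot in minutes (should normally be well under 10):
NEWEST=$(sudo ls -t /persist/services-standby/services@* | head -1)
echo "$(( ($(date +%s) - $(sudo stat -c %Y "$NEWEST")) / 60 )) minutes since last received snapshot"
```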
---
## Failover: Promoting Standby to Primary
**Scenario**: `zippy` is down and you need to promote `c1` to be the NFS server.
### Step 1: Choose Latest Snapshot
On the standby (c1):
```bash
ssh c1
sudo ls -lt /persist/services-standby/services@* | head -5
```
Find the most recent snapshot. Note the timestamp to estimate data loss (typically < 5 minutes).
### Step 2: Promote Snapshot to Read-Write Subvolume
On c1:
```bash
# Find the latest snapshot
LATEST=$(sudo ls -t /persist/services-standby/services@* | head -1)
# Create writable subvolume from snapshot
sudo btrfs subvolume snapshot "$LATEST" /persist/services
# Verify
ls -la /persist/services
```
### Step 3: Update NixOS Configuration
Edit your configuration to swap the NFS server role:
**In `hosts/c1/default.nix`**:
```nix
imports = [
# ... existing imports ...
# ../../common/nfs-services-standby.nix # REMOVE THIS
../../common/nfs-services-server.nix # ADD THIS
];
# Add standbys if desired (optional - can leave empty during emergency)
nfsServicesServer.standbys = []; # Or ["c2"] to add a new standby
```
**Optional: Prepare zippy config for when it comes back**:
In `hosts/zippy/default.nix` (can do this later too):
```nix
imports = [
# ... existing imports ...
# ../../common/nfs-services-server.nix # REMOVE THIS
../../common/nfs-services-standby.nix # ADD THIS
];
# Add the replication key from c1 (get it from c1:/persist/root/.ssh/btrfs-replication.pub)
nfsServicesStandby.replicationKeys = [
"ssh-ed25519 AAAA... root@c1-replication"
];
```
### Step 4: Deploy Configuration
```bash
# From your workstation
deploy -s '.#c1'
# If zippy is still down, updating its config will fail, but that's okay
# You can update it later when it comes back
```
### Step 5: Verify NFS Server is Running
On c1:
```bash
sudo systemctl status nfs-server
sudo showmount -e localhost
dig @localhost -p 8600 data-services.service.consul # Should show c1's IP
```
### Step 6: Verify Clients Can Access
From any node:
```bash
df -h | grep services
ls /data/services
```
The mount should automatically reconnect via Consul DNS.
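A client that already had the share mounted keeps talking to the IP it resolved at mount time; the 60-second idle timeout usually takes care of this, but a remount can be forced. A sketch for the NFS clients (skip the new primary, which uses a local bind mount):
```bash
for host in c2 c3; do
  ssh $host "sudo umount -f /data/services 2>/dev/null || true; ls /data/services > /dev/null && echo remounted"
done
```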
### Step 7: Check Nomad Jobs
```bash
nomad job status mysql
nomad job status postgres
# Verify critical services are healthy
# Jobs constrained to ${meta.storage_role} = "primary" will automatically
# reschedule to c1 once it's deployed with the NFS server module
```
**Recovery Time Objective (RTO)**: ~10-15 minutes
**Recovery Point Objective (RPO)**: Last replication interval (5 minutes max)
**Note**: Jobs with the `storage_role = "primary"` constraint will automatically move to c1 because it now has that node meta attribute. No job spec changes needed!
---
## What Happens When zippy Comes Back?
**IMPORTANT**: If zippy reboots while still configured as NFS server, it will **refuse to start** the NFS service because it detects c1 is already active in Consul.
You'll see this error in `journalctl -u nfs-server`:
```
ERROR: Another NFS server is already active at 192.168.1.X
This host (192.168.1.2) is configured as NFS server but should be standby.
To fix:
1. If this is intentional (failback), first demote the other server
2. Update this host's config to use nfs-services-standby.nix instead
3. Sync data from active server before promoting this host
```
This is a **safety feature** to prevent split-brain and data corruption.
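The check's verdict is also written to the journal, which helps when diagnosing after the fact:
```bash
sudo journalctl -u nfs-server -b | grep -E 'startup check|Another NFS server'
```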
### Options when zippy comes back:
**Option A: Keep c1 as primary** (zippy becomes standby)
1. Update zippy's config to use `nfs-services-standby.nix` (adding c1's replication public key) and create the standby subvolume on zippy: `sudo btrfs subvolume create /persist/services-standby`
2. Add `"zippy"` to `nfsServicesServer.standbys` in c1's config and deploy both hosts
3. c1 will start replicating to zippy on the next timer run (see the checks below)
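To confirm the new replication direction, a sketch using the unit names the module generates:
```bash
ssh c1 "sudo systemctl list-timers 'replicate-services-to-*'"
ssh zippy "sudo ls -t /persist/services-standby/services@* | head -3"
```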
**Option B: Fail back to zippy as primary**
Follow the "Failing Back to Original Primary" procedure below.
---
## Failing Back to Original Primary
**Scenario**: `zippy` is repaired and you want to move the NFS server role back from `c1` to `zippy`.
### Step 1: Sync Latest Data from c1 to zippy
On c1 (current primary):
```bash
# Create readonly snapshot of current state
sudo btrfs subvolume snapshot -r /persist/services /persist/services@failback-$(date +%Y%m%d-%H%M%S)
# Find the snapshot
FAILBACK=$(sudo ls -t /persist/services@failback-* | head -1)
# Send to zippy (use root SSH key if available, or generate temporary key)
sudo btrfs send "$FAILBACK" | ssh root@zippy "btrfs receive /persist/"
```
On zippy:
```bash
# Verify snapshot arrived
ls -la /persist/services@failback-*
# Create writable subvolume from the snapshot
FAILBACK=$(ls -t /persist/services@failback-* | head -1)
sudo btrfs subvolume snapshot "$FAILBACK" /persist/services
# Verify
ls -la /persist/services
```
### Step 2: Update NixOS Configuration
Swap the roles back:
**In `hosts/zippy/default.nix`**:
```nix
imports = [
# ... existing imports ...
# ../../common/nfs-services-standby.nix # REMOVE THIS
../../common/nfs-services-server.nix # ADD THIS
];
nfsServicesServer.standbys = ["c1"];
```
**In `hosts/c1/default.nix`**:
```nix
imports = [
# ... existing imports ...
# ../../common/nfs-services-server.nix # REMOVE THIS
../../common/nfs-services-standby.nix # ADD THIS
];
nfsServicesStandby.replicationKeys = [
"ssh-ed25519 AAAA... root@zippy-replication" # Get from zippy:/persist/root/.ssh/btrfs-replication.pub
];
```
### Step 3: Deploy Configurations
```bash
# IMPORTANT: Deploy c1 FIRST to demote it
deploy -s '.#c1'
# Wait for c1 to stop NFS server
ssh c1 sudo systemctl status nfs-server # Should be inactive
# Then deploy zippy to promote it
deploy -s '.#zippy'
```
The order matters! If you deploy zippy first, it will see c1 is still active and refuse to start.
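A quick sanity check between the two deploys, as a sketch: confirm c1 has actually dropped the role before promoting zippy.
```bash
ssh c1 "sudo systemctl is-active nfs-server || true"   # expect "inactive"
dig +short @c1 -p 8600 data-services.service.consul    # expect no answer until zippy is promoted
```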
### Step 4: Verify Failback
Check Consul DNS points to zippy:
```bash
dig @c1 -p 8600 data-services.service.consul # Should show zippy's IP
```
Check clients are mounting from zippy:
```bash
for host in c1 c2 c3; do
ssh $host "df -h | grep services"
done
```
### Step 5: Clean Up Temporary Snapshots
On c1:
```bash
# Remove the failback snapshot and the promoted subvolume
sudo btrfs subvolume delete /persist/services@failback-*
sudo btrfs subvolume delete /persist/services
```
---
## Adding a New Standby
**Scenario**: You want to add `c2` as an additional standby.
### Step 1: Create Standby Subvolume on c2
```bash
ssh c2
sudo btrfs subvolume create /persist/services-standby
```
### Step 2: Update c2 Configuration
**In `hosts/c2/default.nix`**:
```nix
imports = [
# ... existing imports ...
../../common/nfs-services-standby.nix
];
nfsServicesStandby.replicationKeys = [
"ssh-ed25519 AAAA... root@zippy-replication" # Get from current NFS server
];
```
### Step 3: Update NFS Server Configuration
On the current NFS server (e.g., zippy), update the standbys list:
**In `hosts/zippy/default.nix`**:
```nix
nfsServicesServer.standbys = ["c1" "c2"]; # Added c2
```
### Step 4: Deploy
```bash
deploy -s '.#c2'
deploy -s '.#zippy'
```
The next replication cycle (within 5 minutes) will do a full send to c2, then switch to incremental.
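To avoid waiting for the timer, the first send can be kicked off by hand and followed in the journal (the unit name comes from the per-standby services the module generates):
```bash
ssh zippy
sudo systemctl start replicate-services-to-c2
sudo journalctl -u replicate-services-to-c2 -f
```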
---
## Troubleshooting
### Replication Failed
Check the replication service logs:
```bash
# On NFS server
sudo journalctl -u replicate-services-to-c1 -f
```
Common issues:
- SSH key not found → Run key generation step (see stateful-commands.txt)
- Permission denied → Check authorized_keys on standby (see the manual test below)
- Snapshot already exists → Old snapshot with same timestamp, wait for next cycle
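To exercise the replication SSH path by hand, a sketch (the key is command-restricted, so a successful login runs the forced `btrfs receive`, which then complains about the empty stream; that complaint still proves authentication works, while `Permission denied` means the key is not in the standby's authorized_keys):
```bash
# Run on the NFS server:
sudo ssh -i /persist/root/.ssh/btrfs-replication -o BatchMode=yes root@c1 true < /dev/null
```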
### Clients Can't Mount
Check Consul:
```bash
dig @localhost -p 8600 data-services.service.consul
consul catalog services | grep data-services
```
If Consul isn't resolving:
- NFS server might not have registered → Check `sudo systemctl status nfs-server`
- Consul agent might be down → Check `sudo systemctl status consul`
### Mount is Stale
Force remount:
```bash
sudo systemctl restart data-services.mount
```
Or unmount and let automount handle it:
```bash
sudo umount /data/services
ls /data/services # Triggers automount
```
### Split-Brain Prevention: NFS Server Won't Start
If you see:
```
ERROR: Another NFS server is already active at 192.168.1.X
```
This is **intentional** - the safety check is working! You have two options:
1. **Keep the other server as primary**: Update this host's config to be a standby instead
2. **Fail back to this host**: First demote the other server, sync data, then deploy both hosts in correct order
---
## Monitoring
### Check Replication Status
On NFS server:
```bash
# List recent snapshots
ls -lt /persist/services@* | head
# Check last replication run
sudo systemctl status replicate-services-to-c1
# Check replication logs
sudo journalctl -u replicate-services-to-c1 --since "1 hour ago"
```
On standby:
```bash
# List received snapshots
ls -lt /persist/services-standby/services@* | head
# Check how old the latest snapshot is
stat $(ls -t /persist/services-standby/services@* | head -1) | grep Modify
```
### Verify NFS Exports
```bash
sudo showmount -e localhost
```
Should show:
```
/persist/services 192.168.1.0/24
```
### Check Consul Registration
```bash
consul catalog services | grep data-services
dig @localhost -p 8600 data-services.service.consul
```


@@ -4,6 +4,11 @@
../../common/encrypted-btrfs-layout.nix
../../common/global
../../common/compute-node.nix
../../common/nfs-services-standby.nix # NFS standby for /data/services
# To promote to NFS server (during failover):
# 1. Follow procedure in docs/NFS_FAILOVER.md
# 2. Replace above line with: ../../common/nfs-services-server.nix
# 3. Add nfsServicesServer.standbys = [ "c2" ]; (or leave empty)
./hardware.nix
];
@@ -15,4 +20,9 @@
networking.hostName = "c1";
services.tailscaleAutoconnect.authkey = "tskey-auth-k2nQ771YHM11CNTRL-YVpoumL2mgR6nLPG51vNhRpEKMDN7gLAi";
# NFS standby configuration: accept replication from zippy
nfsServicesStandby.replicationKeys = [
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHyTKsMCbwCIlMcC/aopgz5Yfx/Q9QdlWC9jzMLgYFAV root@zippy-replication"
];
}


@@ -5,6 +5,11 @@
../../common/global
../../common/compute-node.nix
# ../../common/ethereum.nix
../../common/nfs-services-server.nix # NFS server for /data/services
# To move NFS server role to another host:
# 1. Follow procedure in docs/NFS_FAILOVER.md
# 2. Replace above line with: ../../common/nfs-services-standby.nix
# 3. Add nfsServicesStandby.replicationKeys with the new server's public key
./hardware.nix
];
@@ -16,4 +21,7 @@
networking.hostName = "zippy";
services.tailscaleAutoconnect.authkey = "tskey-auth-ktKyQ59f2p11CNTRL-ut8E71dLWPXsVtb92hevNX9RTjmk4owBf";
# NFS server configuration: replicate to c1 as standby
nfsServicesServer.standbys = [ "c1" ];
}


@@ -39,3 +39,22 @@ kopia repository server setup (on a non-NixOS host at the time):
* kopia server start --address 0.0.0.0:51515 --tls-cert-file ~/kopia-certs/kopia.cert --tls-key-file ~/kopia-certs/kopia.key --tls-generate-cert (first time)
* kopia server start --address 0.0.0.0:51515 --tls-cert-file ~/kopia-certs/kopia.cert --tls-key-file ~/kopia-certs/kopia.key (subsequent)
[TLS is mandatory for this]
NFS services server setup (one-time on the NFS server host, e.g. zippy):
* sudo btrfs subvolume create /persist/services
* sudo mkdir -p /persist/root/.ssh
* sudo ssh-keygen -t ed25519 -f /persist/root/.ssh/btrfs-replication -N "" -C "root@$(hostname)-replication"
* Get the public key: sudo cat /persist/root/.ssh/btrfs-replication.pub
Then add this public key to each standby's nfsServicesStandby.replicationKeys option
NFS services standby setup (one-time on each standby host, e.g. c1):
* sudo btrfs subvolume create /persist/services-standby
Moving NFS server role between hosts (e.g. from zippy to c1):
See docs/NFS_FAILOVER.md for detailed procedure
Summary:
1. On current primary: create final snapshot and send to new primary
2. On new primary: promote snapshot to /persist/services
3. Update configs: remove nfs-services-server.nix from old primary, add to new primary
4. Update configs: add nfs-services-standby.nix to old primary (with replication keys)
5. Deploy old primary first (to demote), then new primary (to promote)