diff --git a/common/cluster-node.nix b/common/cluster-node.nix index ff69cc7..b7ecb6f 100644 --- a/common/cluster-node.nix +++ b/common/cluster-node.nix @@ -1,13 +1,14 @@ { pkgs, ... }: { # Cluster node configuration - # Extends minimal-node with cluster-specific services (Consul, GlusterFS, CIFS) + # Extends minimal-node with cluster-specific services (Consul, GlusterFS, CIFS, NFS) # Used by: compute nodes (c1, c2, c3) imports = [ ./minimal-node.nix ./unattended-encryption.nix ./cifs-client.nix ./consul.nix - ./glusterfs-client.nix + ./glusterfs-client.nix # Keep during migration, will be removed in Phase 3 + ./nfs-services-client.nix # New: NFS client for /data/services ]; } diff --git a/common/nfs-services-client.nix b/common/nfs-services-client.nix new file mode 100644 index 0000000..c879d2c --- /dev/null +++ b/common/nfs-services-client.nix @@ -0,0 +1,21 @@ +{ pkgs, ... }: +{ + # NFS client for /data/services + # Mounts from data-services.service.consul (Consul DNS for automatic failover) + # The NFS server registers itself in Consul, so this will automatically + # point to whichever host is currently running the NFS server + + fileSystems."/data/services" = { + device = "data-services.service.consul:/persist/services"; + fsType = "nfs"; + options = [ + "x-systemd.automount" # Auto-mount on access + "noauto" # Don't mount at boot (automount handles it) + "x-systemd.idle-timeout=60" # Unmount after 60s of inactivity + "_netdev" # Network filesystem (wait for network) + ]; + }; + + # Ensure NFS client packages are available + environment.systemPackages = [ pkgs.nfs-utils ]; +} diff --git a/common/nfs-services-server.nix b/common/nfs-services-server.nix new file mode 100644 index 0000000..8523caa --- /dev/null +++ b/common/nfs-services-server.nix @@ -0,0 +1,171 @@ +{ config, lib, pkgs, ... }: + +let + cfg = config.nfsServicesServer; +in +{ + options.nfsServicesServer = { + enable = lib.mkEnableOption "NFS services server" // { default = true; }; + + standbys = lib.mkOption { + type = lib.types.listOf lib.types.str; + default = []; + description = '' + List of standby hostnames to replicate to (e.g. ["c1"]). + + Requires one-time setup on the NFS server: + sudo mkdir -p /persist/root/.ssh + sudo ssh-keygen -t ed25519 -f /persist/root/.ssh/btrfs-replication -N "" -C "root@$(hostname)-replication" + + Then add the public key to each standby's nfsServicesStandby.replicationKeys option. 
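+
+        For example, setting:
+
+          nfsServicesServer.standbys = [ "c1" ];
+
+        creates a systemd service and timer named replicate-services-to-c1 that
+        replicates every five minutes. An empty list keeps the NFS server running
+        but disables replication.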
+ ''; + }; + }; + + config = lib.mkIf cfg.enable { + # Persist root SSH directory for replication key + environment.persistence."/persist" = { + directories = [ + "/root/.ssh" + ]; + }; + + # Bind mount /persist/services to /data/services for local access + # This makes the path consistent with NFS clients + # Use mkForce to override the NFS client mount from cluster-node.nix + fileSystems."/data/services" = lib.mkForce { + device = "/persist/services"; + fsType = "none"; + options = [ "bind" ]; + }; + + # Nomad node metadata: mark this as the primary storage node + # Jobs can constrain to ${meta.storage_role} = "primary" + services.nomad.settings.client.meta = { + storage_role = "primary"; + }; + + # NFS server configuration + services.nfs.server = { + enable = true; + exports = '' + /persist/services 192.168.1.0/24(rw,sync,no_subtree_check,no_root_squash) + ''; + }; + + # Consul service registration for NFS + services.consul.extraConfig.services = [{ + name = "data-services"; + port = 2049; + checks = [{ + tcp = "localhost:2049"; + interval = "30s"; + }]; + }]; + + # Firewall for NFS + networking.firewall.allowedTCPPorts = [ 2049 111 20048 ]; + networking.firewall.allowedUDPPorts = [ 2049 111 20048 ]; + + # systemd services: NFS server split-brain check + replication services + systemd.services = lib.mkMerge ([ + # Safety check: prevent split-brain by ensuring no other NFS server is active + { + nfs-server = { + preStart = '' + # Wait for Consul to be available + for i in {1..30}; do + if ${pkgs.netcat}/bin/nc -z localhost 8600; then + break + fi + echo "Waiting for Consul DNS... ($i/30)" + sleep 1 + done + + # Check if another NFS server is already registered in Consul + CURRENT_SERVER=$(${pkgs.dnsutils}/bin/dig +short @localhost -p 8600 data-services.service.consul | head -1 || true) + MY_IP=$(${pkgs.iproute2}/bin/ip -4 addr show | ${pkgs.gnugrep}/bin/grep -oP '(?<=inet\s)\d+(\.\d+){3}' | ${pkgs.gnugrep}/bin/grep -v '^127\.' | head -1) + + if [ -n "$CURRENT_SERVER" ] && [ "$CURRENT_SERVER" != "$MY_IP" ]; then + echo "ERROR: Another NFS server is already active at $CURRENT_SERVER" + echo "This host ($MY_IP) is configured as NFS server but should be standby." + echo "To fix:" + echo " 1. If this is intentional (failback), first demote the other server" + echo " 2. Update this host's config to use nfs-services-standby.nix instead" + echo " 3. Sync data from active server before promoting this host" + exit 1 + fi + + echo "NFS server startup check passed (no other active server found)" + ''; + }; + } + ] ++ (lib.forEach cfg.standbys (standby: { + "replicate-services-to-${standby}" = { + description = "Replicate /persist/services to ${standby}"; + path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep ]; + + script = '' + set -euo pipefail + + SSH_KEY="/persist/root/.ssh/btrfs-replication" + if [ ! 
-f "$SSH_KEY" ]; then + echo "ERROR: SSH key not found at $SSH_KEY" + echo "Run: sudo ssh-keygen -t ed25519 -f $SSH_KEY -N \"\" -C \"root@$(hostname)-replication\"" + exit 1 + fi + + SNAPSHOT_NAME="services@$(date +%Y%m%d-%H%M%S)" + SNAPSHOT_PATH="/persist/$SNAPSHOT_NAME" + + # Create readonly snapshot + btrfs subvolume snapshot -r /persist/services "$SNAPSHOT_PATH" + + # Find previous snapshot on sender + PREV_LOCAL=$(ls -t /persist/services@* 2>/dev/null | grep -v "^$SNAPSHOT_PATH$" | head -1 || true) + + # Check what snapshots exist on the receiver + REMOTE_SNAPSHOTS=$(ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ + "ls -t /persist/services-standby/services@* 2>/dev/null || true") + + # Decide: incremental or full send + if [ -n "$PREV_LOCAL" ] && echo "$REMOTE_SNAPSHOTS" | grep -q "$(basename "$PREV_LOCAL")"; then + # Receiver has the parent snapshot, do incremental + echo "Incremental send from $(basename $PREV_LOCAL) to ${standby}" + btrfs send -p "$PREV_LOCAL" "$SNAPSHOT_PATH" | \ + ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ + "btrfs receive /persist/services-standby" + else + # Receiver doesn't have parent (new standby or missing snapshot), do full send + echo "Full send to ${standby} (new standby or parent snapshot not found on receiver)" + btrfs send "$SNAPSHOT_PATH" | \ + ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \ + "btrfs receive /persist/services-standby" + fi + + # Cleanup old snapshots on sender (keep last 24 hours = 288 snapshots at 5min intervals) + find /persist -maxdepth 1 -name 'services@*' -mmin +1440 -exec btrfs subvolume delete {} \; + ''; + + serviceConfig = { + Type = "oneshot"; + User = "root"; + }; + }; + })) + ); + + systemd.timers = lib.mkMerge ( + lib.forEach cfg.standbys (standby: { + "replicate-services-to-${standby}" = { + description = "Timer for replicating /persist/services to ${standby}"; + wantedBy = [ "timers.target" ]; + timerConfig = { + OnCalendar = "*:0/5"; # Every 5 minutes + Persistent = true; + }; + }; + }) + ); + }; +} diff --git a/common/nfs-services-standby.nix b/common/nfs-services-standby.nix new file mode 100644 index 0000000..f241a7a --- /dev/null +++ b/common/nfs-services-standby.nix @@ -0,0 +1,68 @@ +{ config, lib, pkgs, ... }: + +let + cfg = config.nfsServicesStandby; +in +{ + options.nfsServicesStandby = { + enable = lib.mkEnableOption "NFS services standby" // { default = true; }; + + replicationKeys = lib.mkOption { + type = lib.types.listOf lib.types.str; + default = []; + description = '' + SSH public keys authorized to replicate btrfs snapshots to this standby. + These keys are restricted to only run 'btrfs receive /persist/services-standby'. 
+
+        Get the public key from the NFS server:
+          ssh <nfs-server> sudo cat /persist/root/.ssh/btrfs-replication.pub
+      '';
+    };
+  };
+
+  config = lib.mkIf cfg.enable {
+    # Allow root SSH login for replication (restricted by command= in authorized_keys)
+    # This is configured in common/sshd.nix
+
+    # Restricted SSH keys for btrfs replication
+    users.users.root.openssh.authorizedKeys.keys =
+      map (key: ''command="btrfs receive /persist/services-standby",restrict ${key}'') cfg.replicationKeys;
+
+    # Mount point for services-standby subvolume
+    # This is just declarative documentation - the subvolume must be created manually once:
+    #   sudo btrfs subvolume create /persist/services-standby
+    # After that, it will persist across reboots (it's under /persist)
+    fileSystems."/persist/services-standby" = {
+      device = "/persist/services-standby";
+      fsType = "none";
+      options = [ "bind" ];
+      noCheck = true;
+    };
+
+    # Cleanup old snapshots on standby (keep last 48 hours for safety)
+    systemd.services.cleanup-services-standby-snapshots = {
+      description = "Cleanup old btrfs snapshots in services-standby";
+      path = [ pkgs.btrfs-progs pkgs.findutils ];
+
+      script = ''
+        set -euo pipefail
+        # Keep last 48 hours of snapshots (576 snapshots at 5min intervals)
+        find /persist/services-standby -maxdepth 1 -name 'services@*' -mmin +2880 -exec btrfs subvolume delete {} \; || true
+      '';
+
+      serviceConfig = {
+        Type = "oneshot";
+        User = "root";
+      };
+    };
+
+    systemd.timers.cleanup-services-standby-snapshots = {
+      description = "Timer for cleaning up old snapshots on standby";
+      wantedBy = [ "timers.target" ];
+      timerConfig = {
+        OnCalendar = "daily";
+        Persistent = true;
+      };
+    };
+  };
+}
diff --git a/common/sshd.nix b/common/sshd.nix
index a771e33..45790c6 100644
--- a/common/sshd.nix
+++ b/common/sshd.nix
@@ -5,6 +5,7 @@
     settings = {
       PasswordAuthentication = false;
       KbdInteractiveAuthentication = false;
+      PermitRootLogin = "prohibit-password"; # Allow root login with SSH keys only
     };
   };
diff --git a/docs/NFS_FAILOVER.md b/docs/NFS_FAILOVER.md
new file mode 100644
index 0000000..5c9ae32
--- /dev/null
+++ b/docs/NFS_FAILOVER.md
@@ -0,0 +1,438 @@
+# NFS Services Failover Procedures
+
+This document describes how to fail over the `/data/services` NFS server between hosts and how to fail back.
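+
+A quick way to check which host currently holds the NFS server role is to query the
+same Consul DNS name that the clients mount from (works from any cluster node):
+
+```bash
+dig @localhost -p 8600 data-services.service.consul   # resolves to the active NFS server's IP
+```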
+ +## Architecture Overview + +- **Primary NFS Server**: Typically `zippy` + - Exports `/persist/services` via NFS + - Has local bind mount: `/data/services` → `/persist/services` (same path as clients) + - Registers `data-services.service.consul` in Consul + - Sets Nomad node meta: `storage_role = "primary"` + - Replicates snapshots to standbys every 5 minutes via btrfs send + - **Safety check**: Refuses to start if another NFS server is already active in Consul + +- **Standby**: Typically `c1` + - Receives snapshots at `/persist/services-standby/services@` + - Can be promoted to NFS server during failover + - No special Nomad node meta (not primary) + +- **Clients**: All cluster nodes (c1, c2, c3, zippy) + - Mount `/data/services` from `data-services.service.consul:/persist/services` + - Automatically connect to whoever is registered in Consul + +### Nomad Job Constraints + +Jobs that need to run on the primary storage node should use: + +```hcl +constraint { + attribute = "${meta.storage_role}" + value = "primary" +} +``` + +This is useful for: +- Database jobs (mysql, postgres, redis) that benefit from local storage +- Jobs that need guaranteed fast disk I/O + +During failover, the `storage_role = "primary"` meta attribute moves to the new NFS server, and Nomad automatically reschedules constrained jobs to the new primary. + +## Prerequisites + +- Standby has been receiving snapshots (check: `ls /persist/services-standby/services@*`) +- Last successful replication was recent (< 5-10 minutes) + +--- + +## Failover: Promoting Standby to Primary + +**Scenario**: `zippy` is down and you need to promote `c1` to be the NFS server. + +### Step 1: Choose Latest Snapshot + +On the standby (c1): + +```bash +ssh c1 +sudo ls -lt /persist/services-standby/services@* | head -5 +``` + +Find the most recent snapshot. Note the timestamp to estimate data loss (typically < 5 minutes). + +### Step 2: Promote Snapshot to Read-Write Subvolume + +On c1: + +```bash +# Find the latest snapshot +LATEST=$(sudo ls -t /persist/services-standby/services@* | head -1) + +# Create writable subvolume from snapshot +sudo btrfs subvolume snapshot "$LATEST" /persist/services + +# Verify +ls -la /persist/services +``` + +### Step 3: Update NixOS Configuration + +Edit your configuration to swap the NFS server role: + +**In `hosts/c1/default.nix`**: +```nix +imports = [ + # ... existing imports ... + # ../../common/nfs-services-standby.nix # REMOVE THIS + ../../common/nfs-services-server.nix # ADD THIS +]; + +# Add standbys if desired (optional - can leave empty during emergency) +nfsServicesServer.standbys = []; # Or ["c2"] to add a new standby +``` + +**Optional: Prepare zippy config for when it comes back**: + +In `hosts/zippy/default.nix` (can do this later too): +```nix +imports = [ + # ... existing imports ... + # ../../common/nfs-services-server.nix # REMOVE THIS + ../../common/nfs-services-standby.nix # ADD THIS +]; + +# Add the replication key from c1 (get it from c1:/persist/root/.ssh/btrfs-replication.pub) +nfsServicesStandby.replicationKeys = [ + "ssh-ed25519 AAAA... 
root@c1-replication" +]; +``` + +### Step 4: Deploy Configuration + +```bash +# From your workstation +deploy -s '.#c1' + +# If zippy is still down, updating its config will fail, but that's okay +# You can update it later when it comes back +``` + +### Step 5: Verify NFS Server is Running + +On c1: + +```bash +sudo systemctl status nfs-server +sudo showmount -e localhost +dig @localhost -p 8600 data-services.service.consul # Should show c1's IP +``` + +### Step 6: Verify Clients Can Access + +From any node: + +```bash +df -h | grep services +ls /data/services +``` + +The mount should automatically reconnect via Consul DNS. + +### Step 7: Check Nomad Jobs + +```bash +nomad job status mysql +nomad job status postgres +# Verify critical services are healthy + +# Jobs constrained to ${meta.storage_role} = "primary" will automatically +# reschedule to c1 once it's deployed with the NFS server module +``` + +**Recovery Time Objective (RTO)**: ~10-15 minutes +**Recovery Point Objective (RPO)**: Last replication interval (5 minutes max) + +**Note**: Jobs with the `storage_role = "primary"` constraint will automatically move to c1 because it now has that node meta attribute. No job spec changes needed! + +--- + +## What Happens When zippy Comes Back? + +**IMPORTANT**: If zippy reboots while still configured as NFS server, it will **refuse to start** the NFS service because it detects c1 is already active in Consul. + +You'll see this error in `journalctl -u nfs-server`: + +``` +ERROR: Another NFS server is already active at 192.168.1.X +This host (192.168.1.2) is configured as NFS server but should be standby. +To fix: + 1. If this is intentional (failback), first demote the other server + 2. Update this host's config to use nfs-services-standby.nix instead + 3. Sync data from active server before promoting this host +``` + +This is a **safety feature** to prevent split-brain and data corruption. + +### Options when zippy comes back: + +**Option A: Keep c1 as primary** (zippy becomes standby) +1. Update zippy's config to use `nfs-services-standby.nix` +2. Deploy to zippy +3. c1 will start replicating to zippy + +**Option B: Fail back to zippy as primary** +Follow the "Failing Back to Original Primary" procedure below. + +--- + +## Failing Back to Original Primary + +**Scenario**: `zippy` is repaired and you want to move the NFS server role back from `c1` to `zippy`. + +### Step 1: Sync Latest Data from c1 to zippy + +On c1 (current primary): + +```bash +# Create readonly snapshot of current state +sudo btrfs subvolume snapshot -r /persist/services /persist/services@failback-$(date +%Y%m%d-%H%M%S) + +# Find the snapshot +FAILBACK=$(sudo ls -t /persist/services@failback-* | head -1) + +# Send to zippy (use root SSH key if available, or generate temporary key) +sudo btrfs send "$FAILBACK" | ssh root@zippy "btrfs receive /persist/" +``` + +On zippy: + +```bash +# Verify snapshot arrived +ls -la /persist/services@failback-* + +# Create writable subvolume from the snapshot +FAILBACK=$(ls -t /persist/services@failback-* | head -1) +sudo btrfs subvolume snapshot "$FAILBACK" /persist/services + +# Verify +ls -la /persist/services +``` + +### Step 2: Update NixOS Configuration + +Swap the roles back: + +**In `hosts/zippy/default.nix`**: +```nix +imports = [ + # ... existing imports ... + # ../../common/nfs-services-standby.nix # REMOVE THIS + ../../common/nfs-services-server.nix # ADD THIS +]; + +nfsServicesServer.standbys = ["c1"]; +``` + +**In `hosts/c1/default.nix`**: +```nix +imports = [ + # ... 
existing imports ... + # ../../common/nfs-services-server.nix # REMOVE THIS + ../../common/nfs-services-standby.nix # ADD THIS +]; + +nfsServicesStandby.replicationKeys = [ + "ssh-ed25519 AAAA... root@zippy-replication" # Get from zippy:/persist/root/.ssh/btrfs-replication.pub +]; +``` + +### Step 3: Deploy Configurations + +```bash +# IMPORTANT: Deploy c1 FIRST to demote it +deploy -s '.#c1' + +# Wait for c1 to stop NFS server +ssh c1 sudo systemctl status nfs-server # Should be inactive + +# Then deploy zippy to promote it +deploy -s '.#zippy' +``` + +The order matters! If you deploy zippy first, it will see c1 is still active and refuse to start. + +### Step 4: Verify Failback + +Check Consul DNS points to zippy: + +```bash +dig @c1 -p 8600 data-services.service.consul # Should show zippy's IP +``` + +Check clients are mounting from zippy: + +```bash +for host in c1 c2 c3; do + ssh $host "df -h | grep services" +done +``` + +### Step 5: Clean Up Temporary Snapshots + +On c1: + +```bash +# Remove the failback snapshot and the promoted subvolume +sudo btrfs subvolume delete /persist/services@failback-* +sudo btrfs subvolume delete /persist/services +``` + +--- + +## Adding a New Standby + +**Scenario**: You want to add `c2` as an additional standby. + +### Step 1: Create Standby Subvolume on c2 + +```bash +ssh c2 +sudo btrfs subvolume create /persist/services-standby +``` + +### Step 2: Update c2 Configuration + +**In `hosts/c2/default.nix`**: +```nix +imports = [ + # ... existing imports ... + ../../common/nfs-services-standby.nix +]; + +nfsServicesStandby.replicationKeys = [ + "ssh-ed25519 AAAA... root@zippy-replication" # Get from current NFS server +]; +``` + +### Step 3: Update NFS Server Configuration + +On the current NFS server (e.g., zippy), update the standbys list: + +**In `hosts/zippy/default.nix`**: +```nix +nfsServicesServer.standbys = ["c1" "c2"]; # Added c2 +``` + +### Step 4: Deploy + +```bash +deploy -s '.#c2' +deploy -s '.#zippy' +``` + +The next replication cycle (within 5 minutes) will do a full send to c2, then switch to incremental. + +--- + +## Troubleshooting + +### Replication Failed + +Check the replication service logs: + +```bash +# On NFS server +sudo journalctl -u replicate-services-to-c1 -f +``` + +Common issues: +- SSH key not found → Run key generation step (see stateful-commands.txt) +- Permission denied → Check authorized_keys on standby +- Snapshot already exists → Old snapshot with same timestamp, wait for next cycle + +### Clients Can't Mount + +Check Consul: + +```bash +dig @localhost -p 8600 data-services.service.consul +consul catalog services | grep data-services +``` + +If Consul isn't resolving: +- NFS server might not have registered → Check `sudo systemctl status nfs-server` +- Consul agent might be down → Check `sudo systemctl status consul` + +### Mount is Stale + +Force remount: + +```bash +sudo systemctl restart data-services.mount +``` + +Or unmount and let automount handle it: + +```bash +sudo umount /data/services +ls /data/services # Triggers automount +``` + +### Split-Brain Prevention: NFS Server Won't Start + +If you see: +``` +ERROR: Another NFS server is already active at 192.168.1.X +``` + +This is **intentional** - the safety check is working! You have two options: + +1. **Keep the other server as primary**: Update this host's config to be a standby instead +2. 
**Fail back to this host**: First demote the other server, sync data, then deploy both hosts in correct order + +--- + +## Monitoring + +### Check Replication Status + +On NFS server: + +```bash +# List recent snapshots +ls -lt /persist/services@* | head + +# Check last replication run +sudo systemctl status replicate-services-to-c1 + +# Check replication logs +sudo journalctl -u replicate-services-to-c1 --since "1 hour ago" +``` + +On standby: + +```bash +# List received snapshots +ls -lt /persist/services-standby/services@* | head + +# Check how old the latest snapshot is +stat /persist/services-standby/services@* | grep Modify | head -1 +``` + +### Verify NFS Exports + +```bash +sudo showmount -e localhost +``` + +Should show: +``` +/persist/services 192.168.1.0/24 +``` + +### Check Consul Registration + +```bash +consul catalog services | grep data-services +dig @localhost -p 8600 data-services.service.consul +``` diff --git a/hosts/c1/default.nix b/hosts/c1/default.nix index e964bac..5a55f65 100644 --- a/hosts/c1/default.nix +++ b/hosts/c1/default.nix @@ -4,6 +4,11 @@ ../../common/encrypted-btrfs-layout.nix ../../common/global ../../common/compute-node.nix + ../../common/nfs-services-standby.nix # NFS standby for /data/services + # To promote to NFS server (during failover): + # 1. Follow procedure in docs/NFS_FAILOVER.md + # 2. Replace above line with: ../../common/nfs-services-server.nix + # 3. Add nfsServicesServer.standbys = [ "c2" ]; (or leave empty) ./hardware.nix ]; @@ -15,4 +20,9 @@ networking.hostName = "c1"; services.tailscaleAutoconnect.authkey = "tskey-auth-k2nQ771YHM11CNTRL-YVpoumL2mgR6nLPG51vNhRpEKMDN7gLAi"; + + # NFS standby configuration: accept replication from zippy + nfsServicesStandby.replicationKeys = [ + "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHyTKsMCbwCIlMcC/aopgz5Yfx/Q9QdlWC9jzMLgYFAV root@zippy-replication" + ]; } diff --git a/hosts/zippy/default.nix b/hosts/zippy/default.nix index 9c815bf..eb07e8e 100644 --- a/hosts/zippy/default.nix +++ b/hosts/zippy/default.nix @@ -5,6 +5,11 @@ ../../common/global ../../common/compute-node.nix # ../../common/ethereum.nix + ../../common/nfs-services-server.nix # NFS server for /data/services + # To move NFS server role to another host: + # 1. Follow procedure in docs/NFS_FAILOVER.md + # 2. Replace above line with: ../../common/nfs-services-standby.nix + # 3. Add nfsServicesStandby.replicationKeys with the new server's public key ./hardware.nix ]; @@ -16,4 +21,7 @@ networking.hostName = "zippy"; services.tailscaleAutoconnect.authkey = "tskey-auth-ktKyQ59f2p11CNTRL-ut8E71dLWPXsVtb92hevNX9RTjmk4owBf"; + + # NFS server configuration: replicate to c1 as standby + nfsServicesServer.standbys = [ "c1" ]; } diff --git a/stateful-commands.txt b/stateful-commands.txt index 25f6a51..306fe29 100644 --- a/stateful-commands.txt +++ b/stateful-commands.txt @@ -39,3 +39,22 @@ kopia repository server setup (on a non-NixOS host at the time): * kopia server start --address 0.0.0.0:51515 --tls-cert-file ~/kopia-certs/kopia.cert --tls-key-file ~/kopia-certs/kopia.key --tls-generate-cert (first time) * kopia server start --address 0.0.0.0:51515 --tls-cert-file ~/kopia-certs/kopia.cert --tls-key-file ~/kopia-certs/kopia.key (subsequent) [TLS is mandatory for this] + +NFS services server setup (one-time on the NFS server host, e.g. 
zippy): + * sudo btrfs subvolume create /persist/services + * sudo mkdir -p /persist/root/.ssh + * sudo ssh-keygen -t ed25519 -f /persist/root/.ssh/btrfs-replication -N "" -C "root@$(hostname)-replication" + * Get the public key: sudo cat /persist/root/.ssh/btrfs-replication.pub + Then add this public key to each standby's nfsServicesStandby.replicationKeys option + +NFS services standby setup (one-time on each standby host, e.g. c1): + * sudo btrfs subvolume create /persist/services-standby + +Moving NFS server role between hosts (e.g. from zippy to c1): + See docs/NFS_FAILOVER.md for detailed procedure + Summary: + 1. On current primary: create final snapshot and send to new primary + 2. On new primary: promote snapshot to /persist/services + 3. Update configs: remove nfs-services-server.nix from old primary, add to new primary + 4. Update configs: add nfs-services-standby.nix to old primary (with replication keys) + 5. Deploy old primary first (to demote), then new primary (to promote)
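+
+Verifying replication after setup (example uses c1 as the standby; adjust to the actual standby name):
+ * On the server: sudo systemctl start replicate-services-to-c1   (triggers an immediate run)
+ * On the server: sudo journalctl -u replicate-services-to-c1 --since "15 minutes ago"
+ * On the standby: sudo ls -lt /persist/services-standby/services@* | head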