{ config, lib, pkgs, ... }:

let
  cfg = config.nfsServicesServer;
in
{
  options.nfsServicesServer = {
    enable = lib.mkEnableOption "NFS services server" // { default = true; };

    standbys = lib.mkOption {
      type = lib.types.listOf lib.types.str;
      default = [];
      description = ''
        List of standby hostnames to replicate to (e.g. ["c1"]).

        Requires one-time setup on the NFS server:
          sudo mkdir -p /persist/root/.ssh
          sudo ssh-keygen -t ed25519 -f /persist/root/.ssh/btrfs-replication -N "" -C "root@$(hostname)-replication"

        Then add the public key to each standby's nfsServicesStandby.replicationKeys option.
      '';
    };
  };
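
  # Usage sketch (hostname is the illustrative "c1" from the description
  # above): the primary sets
  #
  #   nfsServicesServer.standbys = [ "c1" ];
  #
  # while each standby imports nfs-services-standby.nix and accepts this
  # host's public key through its nfsServicesStandby.replicationKeys option.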

  config = lib.mkIf cfg.enable {
    # Persist root SSH directory for replication key
    environment.persistence.${config.custom.impermanence.persistPath} = {
      directories = [
        "/root/.ssh"
      ];
    };
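
    # Persisting /root/.ssh also keeps the known_hosts entries that the
    # replication jobs below record via StrictHostKeyChecking=accept-new.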

    # Bind mount /persist/services to /data/services for local access
    # This makes the path consistent with NFS clients
    # Use mkForce to override the NFS client mount from cluster-node.nix
    fileSystems."/data/services" = lib.mkForce {
      device = "/persist/services";
      fsType = "none";
      options = [ "bind" ];
    };
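
    # A bind mount (rather than mounting this host's own NFS export over
    # localhost) also sidesteps the well-known loopback-NFS deadlock risk.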

    # Nomad node metadata: mark this as the primary storage node
    # Jobs can constrain to ${meta.storage_role} = "primary"
    services.nomad.settings.client.meta = {
      storage_role = "primary";
    };
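
    # For example, a job can pin itself to this node with an HCL constraint
    # like:
    #
    #   constraint {
    #     attribute = "${meta.storage_role}"
    #     value     = "primary"
    #   }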

    # NFS server configuration
    services.nfs.server = {
      enable = true;
      exports = ''
        /persist/services 192.168.1.0/24(rw,sync,no_subtree_check,no_root_squash)
      '';
    };
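
    # Export flags: "sync" favors durability over throughput,
    # "no_subtree_check" avoids stale-handle problems on renames, and
    # "no_root_squash" lets remote root create root-owned files; access is
    # limited to the 192.168.1.0/24 LAN.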

    # Consul service registration for NFS
    services.consul.extraConfig.services = [{
      name = "data-services";
      port = 2049;
      checks = [{
        tcp = "localhost:2049";
        interval = "30s";
      }];
    }];
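
    # Clients and the split-brain check below resolve the active server via
    # data-services.service.consul; because Consul DNS only returns healthy
    # instances, the TCP check drops this host from DNS while nfsd is down.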

    # Firewall for NFS
    networking.firewall.allowedTCPPorts = [ 2049 111 20048 ];
    networking.firewall.allowedUDPPorts = [ 2049 111 20048 ];
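    # 2049 = nfsd, 111 = rpcbind/portmapper, 20048 = mountd.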

    # systemd services: NFS server split-brain check + replication services
    systemd.services = lib.mkMerge ([
      # Safety check: prevent split-brain by ensuring no other NFS server is active
      {
        nfs-server = {
          preStart = ''
            # Wait for Consul to be available
            for i in {1..30}; do
              if ${pkgs.netcat}/bin/nc -z localhost 8600; then
                break
              fi
              echo "Waiting for Consul DNS... ($i/30)"
              sleep 1
            done

            # Check if another NFS server is already registered in Consul
            CURRENT_SERVER=$(${pkgs.dnsutils}/bin/dig +short @localhost -p 8600 data-services.service.consul | head -1 || true)
            MY_IP=$(${pkgs.iproute2}/bin/ip -4 addr show | ${pkgs.gnugrep}/bin/grep -oP '(?<=inet\s)\d+(\.\d+){3}' | ${pkgs.gnugrep}/bin/grep -v '^127\.' | head -1)

            if [ -n "$CURRENT_SERVER" ] && [ "$CURRENT_SERVER" != "$MY_IP" ]; then
              echo "ERROR: Another NFS server is already active at $CURRENT_SERVER"
              echo "This host ($MY_IP) is configured as NFS server but should be standby."
              echo "To fix:"
              echo "  1. If this is intentional (failback), first demote the other server"
              echo "  2. Update this host's config to use nfs-services-standby.nix instead"
              echo "  3. Sync data from active server before promoting this host"
              exit 1
            fi

            echo "NFS server startup check passed (no other active server found)"
          '';
        };
      }
    ] ++ (lib.forEach cfg.standbys (standby: {
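      # One oneshot unit per standby, named replicate-services-to-<hostname>.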
"replicate-services-to-${standby}" = {
|
|
description = "Replicate /persist/services to ${standby}";
|
|
path = [ pkgs.btrfs-progs pkgs.openssh pkgs.coreutils pkgs.findutils pkgs.gnugrep pkgs.curl ];
|
|
|
|
script = ''
|
|
set -euo pipefail
|
|
|
|
START_TIME=$(date +%s)
|
|
REPLICATION_SUCCESS=0
|
|
|
|
SSH_KEY="/persist/root/.ssh/btrfs-replication"
|
|
if [ ! -f "$SSH_KEY" ]; then
|
|
echo "ERROR: SSH key not found at $SSH_KEY"
|
|
echo "Run: sudo ssh-keygen -t ed25519 -f $SSH_KEY -N \"\" -C \"root@$(hostname)-replication\""
|
|
exit 1
|
|
fi
|
|
|
|
SNAPSHOT_NAME="services@$(date +%Y%m%d-%H%M%S)"
|
|
SNAPSHOT_PATH="/persist/$SNAPSHOT_NAME"
|
|
|
|
# Create readonly snapshot
|
|
btrfs subvolume snapshot -r /persist/services "$SNAPSHOT_PATH"
|
|
|
|
# Find previous snapshot on sender (sort by name since readonly snapshots have same mtime)
|
|
# Use -d to list directories only, not their contents
|
|
PREV_LOCAL=$(ls -1d /persist/services@* 2>/dev/null | grep -v "^$SNAPSHOT_PATH$" | sort -r | head -1 || true)
|
|
|
|
# Try incremental send if we have a parent, fall back to full send if it fails
|
|
if [ -n "$PREV_LOCAL" ]; then
|
|
echo "Attempting incremental send from $(basename $PREV_LOCAL) to ${standby}"
|
|
|
|
# Try incremental send, if it fails (e.g., parent missing on receiver), fall back to full
|
|
# Use -c to help with broken Received UUID chains
|
|
if btrfs send -p "$PREV_LOCAL" -c "$PREV_LOCAL" "$SNAPSHOT_PATH" | \
|
|
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
|
|
"btrfs receive /persist/services-standby"; then
|
|
echo "Incremental send completed successfully"
|
|
REPLICATION_SUCCESS=1
|
|
else
|
|
echo "Incremental send failed (likely missing parent on receiver), falling back to full send"
|
|
# Plain full send without clone source (receiver may have no snapshots)
|
|
btrfs send "$SNAPSHOT_PATH" | \
|
|
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
|
|
"btrfs receive /persist/services-standby"
|
|
REPLICATION_SUCCESS=1
|
|
fi
|
|
else
|
|
# First snapshot, do full send
|
|
echo "Full send to ${standby} (first snapshot)"
|
|
btrfs send "$SNAPSHOT_PATH" | \
|
|
ssh -i "$SSH_KEY" -o StrictHostKeyChecking=accept-new root@${standby} \
|
|
"btrfs receive /persist/services-standby"
|
|
REPLICATION_SUCCESS=1
|
|
fi
|
|
|
|
# Cleanup old snapshots on sender (keep last 10 snapshots, sorted by name/timestamp)
|
|
ls -1d /persist/services@* 2>/dev/null | sort | head -n -10 | xargs -r btrfs subvolume delete
|
|
|
|
# Calculate metrics
|
|
END_TIME=$(date +%s)
|
|
DURATION=$((END_TIME - START_TIME))
|
|
SNAPSHOT_COUNT=$(ls -1d /persist/services@* 2>/dev/null | wc -l)
|
|
|
|
# Push metrics to Prometheus pushgateway
|
|
cat <<METRICS | curl -s --data-binary @- http://pushgateway.service.consul:9091/metrics/job/nfs_replication/instance/${standby} || true
|
|
# TYPE nfs_replication_last_success_timestamp gauge
|
|
nfs_replication_last_success_timestamp $END_TIME
|
|
# TYPE nfs_replication_duration_seconds gauge
|
|
nfs_replication_duration_seconds $DURATION
|
|
# TYPE nfs_replication_snapshot_count gauge
|
|
nfs_replication_snapshot_count $SNAPSHOT_COUNT
|
|
# TYPE nfs_replication_success gauge
|
|
nfs_replication_success $REPLICATION_SUCCESS
|
|
METRICS
|
|
'';
|
|
|
|
serviceConfig = {
|
|
Type = "oneshot";
|
|
User = "root";
|
|
};
|
|
};
|
|
}))
|
|
);
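
    # The replication units are plain oneshots, so a sync can also be run by
    # hand, e.g. `systemctl start replicate-services-to-c1.service` (using the
    # example hostname from the standbys option).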

    systemd.timers = lib.mkMerge (
      lib.forEach cfg.standbys (standby: {
        "replicate-services-to-${standby}" = {
          description = "Timer for replicating /persist/services to ${standby}";
          wantedBy = [ "timers.target" ];
          timerConfig = {
            OnCalendar = "*:0/5"; # Every 5 minutes
            Persistent = true;
          };
        };
      })
    );
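
    # Persistent = true makes systemd fire a missed run immediately at the
    # next boot if the host was down when the timer should have elapsed, so
    # standbys catch up after an outage instead of waiting for the next slot.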
  };
}