Make sure DNS is up before mounting NFS.

This commit is contained in:
2025-10-24 22:49:32 +01:00
parent a7dce7cfb9
commit a57fc9107b
4 changed files with 66 additions and 0 deletions

View File

@@ -11,4 +11,8 @@
./glusterfs-client.nix # Keep during migration, will be removed in Phase 3
./nfs-services-client.nix # New: NFS client for /data/services
];
# Wait for eno1 to be routable before considering network online
# (hosts with different primary interfaces should override this)
# NOTE(review): extraArgs is presumably a list option, and NixOS merges plain
# list definitions from several modules instead of replacing them. A host that
# sets its own "--interface=..." would then wait for eno1 as well, unless it
# uses lib.mkForce (or this default is wrapped in lib.mkDefault) - confirm.
systemd.network.wait-online.extraArgs = [ "--interface=eno1:routable" ];
}

View File

@@ -9,12 +9,17 @@
# The mount is established at boot time and persists - no auto-unmount.
# This prevents issues with Docker bind mounts seeing empty automount stubs.
imports = [
./wait-for-dns-ready.nix
];
# NFS mount for /data/services. The server is addressed by its Consul DNS name
# (data-services.service.consul), so the mount can only succeed once that name
# resolves on this host.
fileSystems."/data/services" = {
device = "data-services.service.consul:/persist/services";
fsType = "nfs";
options = [
"nofail" # Don't block boot if mount fails
"x-systemd.mount-timeout=30s" # Timeout for mount attempts
# Ordering only: the mount unit is placed after wait-for-dns-ready.service
# (defined in ./wait-for-dns-ready.nix). This option does not start that
# service by itself; the service is started through its own
# wantedBy = [ "multi-user.target" ].
"x-systemd.after=wait-for-dns-ready.service" # Wait for DNS to actually work
"_netdev" # Network filesystem (wait for network)
];
};

View File

@@ -0,0 +1,55 @@
{ pkgs, ... }:
{
  # Service that waits until DNS resolution is actually functional.
  #
  # Why: network-online.target and systemd-networkd-wait-online.service only
  # check that interfaces are configured; they do not guarantee that names can
  # be resolved.
  #
  # Problem: NFS mounts using Consul DNS names (data-services.service.consul)
  # fail at boot because DNS resolution is not ready even though the network
  # is reported as "online".
  #
  # Solution: actively test DNS resolution and only finish once it works, or
  # once a wall-clock budget has been used up. Units that need working DNS can
  # order themselves after this service.
  systemd.services.wait-for-dns-ready = {
    description = "Wait for DNS resolution to be functional";
    after = [
      "systemd-networkd-wait-online.service"
      "systemd-resolved.service"
      "network-online.target"
    ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
    serviceConfig = {
      Type = "oneshot";
      # Stay "active" after the script exits so units ordered after this
      # service see it as done for the rest of the boot.
      RemainAfterExit = true;
      ExecStart = pkgs.writeShellScript "wait-for-dns-ready" ''
        # getent respects /etc/nsswitch.conf (and thereby systemd-resolved),
        # so it exercises the same lookup path as other programs on the host.
        getent=${pkgs.glibc.bin}/bin/getent
        consul_name=data-services.service.consul
        public_name=www.google.com
        max_wait=30

        # Succeeds when the given host name resolves; output is discarded.
        resolves() {
          "$getent" hosts "$1" >/dev/null 2>&1
        }

        # Resolving the Consul name checks the full DNS path:
        # interface -> gateway -> Consul DNS
        echo "Waiting for DNS resolution to be ready..."

        # Bound the wait by elapsed wall-clock time rather than by a fixed
        # number of attempts: a failing lookup can itself block for several
        # seconds, so counting attempts could wait far longer than intended.
        # SECONDS is bash's built-in elapsed-time counter. The final attempt
        # may still overrun the budget by the duration of its lookups.
        SECONDS=0
        attempt=0
        while (( SECONDS < max_wait )); do
          attempt=$((attempt + 1))
          if resolves "$consul_name"; then
            echo "DNS ready: $consul_name resolved successfully"
            exit 0
          fi
          # Also test a public DNS name to distinguish a general DNS failure
          # from a Consul-specific one (helpful for debugging).
          if resolves "$public_name"; then
            echo "Attempt $attempt after $SECONDS/$max_wait seconds: General DNS works but Consul DNS not ready yet, waiting..."
          else
            echo "Attempt $attempt after $SECONDS/$max_wait seconds: General DNS not working yet, waiting..."
          fi
          sleep 1
        done

        echo "Warning: DNS not fully ready after $max_wait seconds"
        echo "NFS mounts with 'nofail' option will handle this gracefully"
        exit 0 # Don't block boot - let nofail mounts handle DNS failures
      '';
    };
  };
}

View File

@@ -25,6 +25,8 @@
networking.useNetworkd = true;
systemd.network.enable = true;
# Wait for br0 to be routable before considering network online
# NOTE(review): if this host also imports a module that sets
# wait-online.extraArgs (e.g. "--interface=eno1:routable"), the two lists are
# presumably concatenated and wait-online would wait for both interfaces;
# lib.mkForce may be needed here - confirm.
systemd.network.wait-online.extraArgs = [ "--interface=br0:routable" ];
# not useful and potentially a security loophole
services.resolved.llmnr = "false";