diff --git a/common/cluster-member.nix b/common/cluster-member.nix
index b7ecb6f..2fd453a 100644
--- a/common/cluster-member.nix
+++ b/common/cluster-member.nix
@@ -11,4 +11,11 @@
     ./glusterfs-client.nix # Keep during migration, will be removed in Phase 3
     ./nfs-services-client.nix # New: NFS client for /data/services
   ];
+
+  # Wait for eno1 to be routable before considering network online.
+  # NOTE(review): extraArgs is a list, and plain list definitions from several
+  # modules are concatenated rather than replaced. A host with a different
+  # primary interface must override this with lib.mkForce, or this definition
+  # should become lib.mkDefault - confirm `lib` is in scope before changing.
+  systemd.network.wait-online.extraArgs = [ "--interface=eno1:routable" ];
 }
diff --git a/common/nfs-services-client.nix b/common/nfs-services-client.nix
index 77c885f..312d68a 100644
--- a/common/nfs-services-client.nix
+++ b/common/nfs-services-client.nix
@@ -9,12 +9,17 @@
   # The mount is established at boot time and persists - no auto-unmount.
   # This prevents issues with Docker bind mounts seeing empty automount stubs.
 
+  imports = [
+    ./wait-for-dns-ready.nix
+  ];
+
   fileSystems."/data/services" = {
     device = "data-services.service.consul:/persist/services";
     fsType = "nfs";
     options = [
       "nofail" # Don't block boot if mount fails
       "x-systemd.mount-timeout=30s" # Timeout for mount attempts
+      "x-systemd.after=wait-for-dns-ready.service" # Wait for DNS to actually work
       "_netdev" # Network filesystem (wait for network)
     ];
   };
diff --git a/common/wait-for-dns-ready.nix b/common/wait-for-dns-ready.nix
new file mode 100644
index 0000000..04293f5
--- /dev/null
+++ b/common/wait-for-dns-ready.nix
@@ -0,0 +1,70 @@
+{ pkgs, ... }:
+{
+  # Service to wait for DNS resolution to be actually functional.
+  # This is needed because network-online.target and wait-online.service
+  # don't guarantee DNS works - they only check that interfaces are configured.
+  #
+  # Problem: NFS mounts using Consul DNS names (data-services.service.consul)
+  # fail at boot because DNS resolution isn't ready even though network is "online"
+  #
+  # Solution: Actively test DNS resolution before considering network truly ready.
+  # The wait is bounded by wall-clock time (not by attempt count) and the
+  # script always exits 0.
+
+  systemd.services.wait-for-dns-ready = {
+    description = "Wait for DNS resolution to be functional";
+    after = [
+      "systemd-networkd-wait-online.service"
+      "systemd-resolved.service"
+      "network-online.target"
+    ];
+    wants = [ "network-online.target" ];
+    wantedBy = [ "multi-user.target" ];
+
+    serviceConfig = {
+      Type = "oneshot";
+      RemainAfterExit = true;
+      # Type=oneshot units have no start timeout by default; this is a backstop
+      # in case the script's own deadline below is ever bypassed.
+      TimeoutStartSec = "60s";
+      ExecStart = pkgs.writeShellScript "wait-for-dns-ready" ''
+        # Test DNS resolution by attempting to resolve data-services.service.consul
+        # This ensures the full DNS path works: interface → gateway → Consul DNS
+
+        max_wait=30      # overall deadline, in seconds
+        lookup_timeout=2 # per-lookup cap; an unreachable resolver can otherwise block far longer
+
+        # Use getent which respects /etc/nsswitch.conf and systemd-resolved
+        resolves() {
+          ${pkgs.coreutils}/bin/timeout "$lookup_timeout" ${pkgs.glibc.bin}/bin/getent hosts "$1" >/dev/null 2>&1
+        }
+
+        echo "Waiting for DNS resolution to be ready..."
+
+        attempt=0
+        while (( SECONDS < max_wait )); do
+          attempt=$((attempt + 1))
+
+          if resolves data-services.service.consul; then
+            echo "DNS ready: data-services.service.consul resolved successfully"
+            exit 0
+          fi
+
+          # Also test a public DNS name to distinguish between general DNS failure
+          # vs Consul-specific issues (helpful for debugging)
+          if ! resolves www.google.com; then
+            echo "Attempt $attempt: General DNS not working yet, waiting..."
+          else
+            echo "Attempt $attempt: General DNS works but Consul DNS not ready yet, waiting..."
+          fi
+
+          sleep 1
+        done
+
+        echo "Warning: DNS not fully ready after $max_wait seconds"
+        echo "NFS mounts with 'nofail' option will handle this gracefully"
+        exit 0 # Don't block boot - let nofail mounts handle DNS failures
+      '';
+    };
+  };
+}
diff --git a/hosts/chilly/default.nix b/hosts/chilly/default.nix
index 698bfe5..7098cc9 100644
--- a/hosts/chilly/default.nix
+++ b/hosts/chilly/default.nix
@@ -25,6 +25,11 @@
 
   networking.useNetworkd = true;
   systemd.network.enable = true;
+  # Wait for br0 to be routable before considering network online
+  # NOTE(review): if this host also imports common/cluster-member.nix, this list
+  # is concatenated with its eno1 entry instead of replacing it, so wait-online
+  # would wait for both interfaces - wrap in lib.mkForce if that applies.
+  systemd.network.wait-online.extraArgs = [ "--interface=br0:routable" ];
 
   # not useful and potentially a security loophole
   services.resolved.llmnr = "false";