Files
alo-cluster/common/wait-for-dns-ready.nix

56 lines
2.1 KiB
Nix

{ pkgs, ... }:
let
  # Absolute store paths so the script does not depend on the unit's PATH.
  getent = "${pkgs.glibc.bin}/bin/getent";
  sleep = "${pkgs.coreutils}/bin/sleep";
  timeout = "${pkgs.coreutils}/bin/timeout";
in
{
  # Service to wait for DNS resolution to be actually functional.
  # This is needed because network-online.target and wait-online.service
  # don't guarantee DNS works - they only check that interfaces are configured.
  #
  # Problem: NFS mounts using Consul DNS names (data-services.service.consul)
  # fail at boot because DNS resolution isn't ready even though network is "online".
  #
  # Solution: actively test DNS resolution before considering the network truly
  # ready. The wait is bounded by wall-clock time, and the script always exits 0,
  # so a DNS outage delays dependents by a bounded amount instead of failing them.
  systemd.services.wait-for-dns-ready = {
    description = "Wait for DNS resolution to be functional";
    after = [
      "systemd-networkd-wait-online.service"
      "systemd-resolved.service"
      "network-online.target"
    ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
    serviceConfig = {
      # oneshot + RemainAfterExit: the unit stays "active" once the script has
      # finished, so units ordered after it start only after the wait is over.
      Type = "oneshot";
      RemainAfterExit = true;
      ExecStart = pkgs.writeShellScript "wait-for-dns-ready" ''
        # Test DNS resolution by attempting to resolve the Consul service name.
        # This ensures the full DNS path works: interface -> gateway -> Consul DNS
        consul_name=data-services.service.consul
        # Public name, only used to tell "no DNS at all" from "Consul DNS not up yet".
        public_name=www.google.com
        # Total wall-clock budget in seconds. A plain attempt counter is not
        # enough: a single lookup can block for a long time while DNS is down.
        max_wait=30

        # resolves NAME
        # Succeeds if NAME resolves before the remaining time budget runs out.
        # Uses getent, which respects /etc/nsswitch.conf and systemd-resolved.
        resolves() {
          local remaining=$(( max_wait - SECONDS ))
          (( remaining > 0 )) || return 1
          ${timeout} "$remaining" ${getent} hosts "$1" >/dev/null 2>&1
        }

        echo "Waiting for DNS resolution to be ready..."
        SECONDS=0 # bash increments this once per second; used as the stopwatch
        attempt=0
        while (( SECONDS < max_wait )); do
          attempt=$(( attempt + 1 ))

          if resolves "$consul_name"; then
            echo "DNS ready: $consul_name resolved successfully"
            exit 0
          fi

          # Budget used up by the lookup above: skip the diagnostic probe so it
          # is not misreported as a general DNS failure.
          (( SECONDS < max_wait )) || break

          # Also test a public DNS name to distinguish between general DNS failure
          # vs Consul-specific issues (helpful for debugging)
          if resolves "$public_name"; then
            echo "Attempt $attempt: General DNS works but Consul DNS not ready yet, waiting..."
          else
            echo "Attempt $attempt: General DNS not working yet, waiting..."
          fi

          ${sleep} 1
        done

        echo "Warning: DNS not fully ready after $max_wait seconds" >&2
        echo "NFS mounts with 'nofail' option will handle this gracefully" >&2
        exit 0 # Don't block boot - let nofail mounts handle DNS failures
      '';
    };
  };
}