Another attempt at fixing the NFS race.

Install killall everywhere.
2025-10-24 13:59:39 +01:00 · 2025-10-24 11:56:47 +01:00
3 changed files with 76 additions and 17 deletions
--- a/common/global/packages.nix
+++ b/common/global/packages.nix
@@ -3,6 +3,7 @@
  environment.systemPackages = with pkgs; [
    age
    file
+    killall
    lm_sensors # TODO: this shouldn't be installed on cloud nodes
    nodejs_20 # TODO: this is for one job on nomad, it should just be a dependency there
    neovim
--- a/common/nfs-services-client.nix
+++ b/common/nfs-services-client.nix
@@ -4,14 +4,17 @@
  # Mounts from data-services.service.consul (Consul DNS for automatic failover)
  # The NFS server registers itself in Consul, so this will automatically
  # point to whichever host is currently running the NFS server
+  #
+  # Uses persistent mount (not automount) with nofail to prevent blocking boot.
+  # The mount is established at boot time and persists - no auto-unmount.
+  # This prevents issues with Docker bind mounts seeing empty automount stubs.

  fileSystems."/data/services" = {
    device = "data-services.service.consul:/persist/services";  # Consul DNS name of whichever host currently runs the NFS server
    fsType = "nfs";
    options = [
-      "x-systemd.automount"  # Auto-mount on access
-      "noauto"               # Don't mount at boot (automount handles it)
-      "x-systemd.idle-timeout=60"  # Unmount after 60s of inactivity
+      "nofail"                      # Don't block boot if mount fails
+      "x-systemd.mount-timeout=30s" # Give up on a single mount attempt after 30s
      "_netdev"                     # Network filesystem (wait for network)
    ];
  };
--- a/common/nomad.nix
+++ b/common/nomad.nix
@@ -59,20 +59,75 @@ in
    extraSettingsPaths = [ "/etc/nomad-alo.json" ];
  };

-  # Fix race condition between NFS automount and Docker bind mounts:
-  # Without this, Docker can bind-mount the empty automount stub directory
-  # before NFS actually mounts, causing permission errors and missing data.
-  # - RequiresMountsFor: tells systemd that Nomad depends on /data/services
-  # - ExecStartPre: triggers the automount before Nomad starts
-  # Note: boot will still succeed if NFS is unavailable (Nomad just won't start)
-  # TODO: NFS mount uses Consul DNS which resolves to an IP at mount time.
-  #       If the NFS server moves to a different IP, the mount becomes stale
-  #       and needs to be remounted. Consider using a VIP or implementing
-  #       a health check that remounts on staleness detection.
+  # NFS mount dependency configuration for Nomad:
+  #
+  # Problem: Docker bind mounts need the real NFS mount, not an empty stub.
+  # If Nomad starts before NFS is mounted, containers get empty directories.
+  #
+  # Solution: Use soft dependencies (wants/after) with health-checking recovery.
+  # - wants: Nomad wants the mount, but won't be killed if it goes away
+  # - after: Nomad waits for mount to be attempted before starting
+  # - ExecStartPre with findmnt: Fails the Nomad start (it does not wait) unless /data/services is mounted
+  #
+  # This prevents Docker race conditions while allowing:
+  # - Boot to proceed if NFS unavailable (Nomad fails to start; retries depend on the unit's Restart= settings, not set here - TODO confirm)
+  # - Nomad to keep running if NFS temporarily fails (containers may error)
+  # - Recovery service to auto-restart Nomad when NFS comes back or becomes stale
+  #
+  # Note: Mount uses Consul DNS which resolves at mount time. If NFS server
+  #       moves to different IP, mount becomes stale and needs remount.
+  #       The recovery service handles this by detecting stale mounts and restarting Nomad.
  systemd.services.nomad = {
-    wants = [ "network-online.target" ];
-    unitConfig.RequiresMountsFor = [ "/data/services" ];
-    serviceConfig.ExecStartPre = "${pkgs.coreutils}/bin/ls /data/services";
+    wants = [ "network-online.target" "data-services.mount" ];
+    after = [ "data-services.mount" ];
+    serviceConfig.ExecStartPre = "${pkgs.util-linux}/bin/findmnt --mountpoint /data/services";
+  };
+
+  # Recovery service: repair the NFS mount and restart Nomad when needed.
+  # This handles scenarios where:
+  # - NFS server was down during boot (mount failed, Nomad hit start-limit)
+  # - NFS server failed over to different host with new IP (mount went stale)
+  # - Network outage temporarily broke the mount
+  #
+  # The timer runs every 30s; each run does the following:
+  # 1. Mount missing → try to start data-services.mount. If that fails the
+  #    NFS server is still unavailable; give up until the next run.
+  # 2. Mount stale/inaccessible → force a lazy unmount, then restart Nomad.
+  #    Restarting Nomad alone cannot remount: the mount unit is still active
+  #    and findmnt still lists a stale mount. Once it is unmounted, Nomad's
+  #    wants/after on data-services.mount pull in a fresh mount (and a fresh
+  #    Consul DNS lookup). If that races, step 1 repairs it on the next run.
+  # 3. Mount healthy but Nomad failed → restart Nomad (normal recovery)
+  systemd.services.nomad-mount-watcher = {
+    description = "Restart Nomad when NFS mount needs attention";
+    serviceConfig = {
+      Type = "oneshot";
+      ExecStart = pkgs.writeShellScript "nomad-mount-watcher" ''
+        # Mount missing (e.g. NFS was down at boot and nofail let boot continue):
+        # try to establish it instead of silently doing nothing.
+        if ! ${pkgs.util-linux}/bin/findmnt --mountpoint /data/services >/dev/null 2>&1; then
+          echo "NFS mount is missing. Trying to mount..."
+          if ! ${pkgs.systemd}/bin/systemctl start data-services.mount; then
+            exit 0  # Still unavailable; retry on the next timer run
+          fi
+        fi
+
+        # Check if mount is actually accessible (not stale)
+        # Use timeout to avoid hanging on stale NFS mounts
+        if ! ${pkgs.coreutils}/bin/timeout 5s ${pkgs.coreutils}/bin/stat /data/services >/dev/null 2>&1; then
+          echo "NFS mount is stale or inaccessible. Unmounting and restarting Nomad to trigger remount..."
+          # Detach the stale mount first; otherwise the mount unit stays active
+          # and the Nomad restart below would not remount anything.
+          ${pkgs.util-linux}/bin/umount --force --lazy /data/services || true
+          ${pkgs.systemd}/bin/systemctl restart nomad.service
+          exit 0
+        fi
+
+        # Mount is healthy - check if Nomad needs recovery
+        if ${pkgs.systemd}/bin/systemctl is-failed nomad.service >/dev/null 2>&1; then
+          echo "NFS mount is healthy but Nomad is failed. Restarting Nomad..."
+          ${pkgs.systemd}/bin/systemctl restart nomad.service
+        fi
+      '';
+    };
+  };
+
+  # Periodically triggers nomad-mount-watcher.service.
+  # AccuracySec is set explicitly: systemd's default accuracy is 1min, which
+  # allows each elapse to be delayed by up to a minute and would defeat the
+  # intended 30s check interval.
+  systemd.timers.nomad-mount-watcher = {
+    description = "Timer for Nomad mount watcher";
+    wantedBy = [ "timers.target" ];
+    timerConfig = {
+      OnBootSec = "1min";       # First run 1min after boot
+      OnUnitActiveSec = "30s";  # Then every 30s
+      AccuracySec = "1s";       # Default (1min) is too coarse for a 30s interval
+      Unit = "nomad-mount-watcher.service";
+    };
  };

  environment.etc."nomad-alo.json".text = builtins.toJSON {
Author	SHA1	Message	Date
Petru Paler	cf2210ec77	Another attempt at fixing the NFS race.	2025-10-24 13:59:39 +01:00
Petru Paler	1dc219d08f	Install killall everywhere.	2025-10-24 11:56:47 +01:00