# inspiration: https://github.com/astro/skyflake/blob/main/nixos-modules/nomad.nix
{ pkgs, config, lib, ... }:

let
  servers = [ "c1" "c2" "c3" ];
in
{
  options.clusterRole.nomadServer = lib.mkEnableOption "Nomad server mode";

  config = {
    services.nomad = {
      enable = true;
      # true breaks at least CSI volumes
      # TODO: consider fixing
      dropPrivileges = false;

      settings = {
        datacenter = "alo";

        client = {
          enabled = true;
          server_join.retry_join = servers;
          host_network.tailscale = {
            interface = "tailscale0";
            cidr = "100.64.0.0/10";
          };
          host_volume = {
            services = {
              path = "/data/services";
              read_only = false;
            };
            nix-store = {
              path = "/nix/store";
              read_only = true;
            };
            sw = {
              path = "/run/current-system/sw";
              read_only = true;
            };
          };
        };

        server = {
          enabled = config.clusterRole.nomadServer;
          # a majority of the configured servers (2 of 3)
          bootstrap_expect = (builtins.length servers + 2) / 2;
          server_join.retry_join = servers;
        };

        telemetry = {
          collection_interval = "1s";
          disable_hostname = true;
          prometheus_metrics = true;
          publish_allocation_metrics = true;
          publish_node_metrics = true;
        };
      };

      extraSettingsPaths = [ "/etc/nomad-alo.json" ];
    };

    # NFS mount dependency configuration for Nomad:
    #
    # Problem: Docker bind mounts need the real NFS mount, not an empty stub.
    # If Nomad starts before NFS is mounted, containers get empty directories.
    #
    # Solution: use soft dependencies (wants/after) with health-checking recovery.
    # - wants: Nomad wants the mount, but won't be killed if it goes away
    # - after: Nomad waits for the mount to be attempted before starting
    # - ExecStartPre with findmnt: fails Nomad's start (so systemd retries)
    #   until the mount is actually active
    #
    # This prevents Docker race conditions while allowing:
    # - boot to proceed if NFS is unavailable (Nomad fails to start, systemd retries)
    # - Nomad to keep running if NFS temporarily fails (containers may error)
    # - the recovery service to auto-restart Nomad when NFS comes back or becomes stale
    #
    # Note: the mount uses Consul DNS, which is resolved at mount time. If the NFS
    # server moves to a different IP, the mount becomes stale and needs a remount.
    # The recovery service handles this by detecting stale mounts and restarting Nomad.
    systemd.services.nomad = {
      wants = [ "network-online.target" "data-services.mount" ];
      after = [ "data-services.mount" ];
      serviceConfig.ExecStartPre = "${pkgs.util-linux}/bin/findmnt --mountpoint /data/services";
    };

    # Recovery service: automatically restart Nomad when the NFS mount needs attention.
    # This handles scenarios where:
    # - the NFS server was down during boot (mount failed, Nomad hit its start-limit)
    # - the NFS server failed over to a different host with a new IP (mount went stale)
    # - a network outage temporarily broke the mount
    #
    # The timer runs every 30s and checks:
    # 1. Is the mount healthy (exists and accessible)?
    # 2. If the mount is stale/inaccessible → restart Nomad (triggers a remount)
    # 3. If the mount is healthy but Nomad failed → restart Nomad (normal recovery)
    systemd.services.nomad-mount-watcher = {
      description = "Restart Nomad when NFS mount needs attention";
      serviceConfig = {
        Type = "oneshot";
        ExecStart = pkgs.writeShellScript "nomad-mount-watcher" ''
          # Check if the mount point exists
          if ! ${pkgs.util-linux}/bin/findmnt --mountpoint /data/services >/dev/null 2>&1; then
            exit 0 # Mount not present, nothing to do
          fi

          # Check if the mount is actually accessible (not stale);
          # use timeout to avoid hanging on stale NFS mounts
          if ! ${pkgs.coreutils}/bin/timeout 5s ${pkgs.coreutils}/bin/stat /data/services >/dev/null 2>&1; then
            echo "NFS mount is stale or inaccessible. Restarting Nomad to trigger remount..."
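            # Per the dependency notes above: a stale handle usually means the
            # Consul DNS name behind the mount now points at a different IP, and
            # restarting Nomad is what re-triggers the remount.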
            ${pkgs.systemd}/bin/systemctl restart nomad.service
            exit 0
          fi

          # Mount is healthy - check if Nomad needs recovery
          if ${pkgs.systemd}/bin/systemctl is-failed nomad.service >/dev/null 2>&1; then
            echo "NFS mount is healthy but Nomad has failed. Restarting Nomad..."
            ${pkgs.systemd}/bin/systemctl restart nomad.service
          fi
        '';
      };
    };

    systemd.timers.nomad-mount-watcher = {
      description = "Timer for Nomad mount watcher";
      wantedBy = [ "timers.target" ];
      timerConfig = {
        OnBootSec = "1min";      # first run 1min after boot
        OnUnitActiveSec = "30s"; # then every 30s
        Unit = "nomad-mount-watcher.service";
      };
    };

    environment.etc."nomad-alo.json".text = builtins.toJSON {
      plugin.docker.config = {
        allow_privileged = true;
        # for keepalived, though it only really needs
        # NET_ADMIN, NET_BROADCAST and NET_RAW on top of the defaults
        # TODO: trim this down
        allow_caps = [ "all" ];
        volumes.enabled = true;
        extra_labels = [ "job_name" "task_group_name" "task_name" "node_name" ];
      };
      plugin.raw_exec.config.enabled = true;
    };

    environment.persistence."/persist".directories = [
      "/var/lib/docker"
      "/var/lib/nomad"
    ];

    environment.systemPackages = with pkgs; [ nomad wander damon ];

    networking.firewall = {
      allowedTCPPorts = if config.clusterRole.nomadServer then [ 4646 4647 4648 ] else [ 4646 ];
      allowedUDPPorts = if config.clusterRole.nomadServer then [ 4648 ] else [ ];
    };
  };
}
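
# Usage sketch (hypothetical host layout; the import path and filename are
# assumptions): a node from the `servers` list above opts into server mode,
# while other machines import the module and leave clusterRole.nomadServer
# at its default of false.
#
#   # hosts/c1/configuration.nix
#   { ... }: {
#     imports = [ ../../nixos-modules/nomad.nix ];
#     clusterRole.nomadServer = true;
#   }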