# alo-cluster/common/nomad.nix

# inspiration: https://github.com/astro/skyflake/blob/main/nixos-modules/nomad.nix
{ pkgs, config, lib, ... }:
let
  # Names of the Nomad server nodes. Used below as the retry_join targets for
  # both the client and server stanzas, and to derive bootstrap_expect.
  servers = [ "c1" "c2" "c3" ];
in
{
  # Per-host toggle, read in this module as config.clusterRole.nomadServer:
  # it switches on the Nomad server stanza and the server-only firewall ports.
  options.clusterRole = {
    nomadServer = lib.mkEnableOption "Nomad server mode";
  };

  config = {
    services.nomad =
      let
        # The client and the server stanza both join through the same list of
        # server names.
        joinViaServers = {
          retry_join = servers;
        };

        # Build one host_volume entry from its path and read-only flag.
        hostVolume = path: read_only: { inherit path read_only; };
      in
      {
        enable = true;
        # true breaks at least CSI volumes
        # TODO: consider fixing
        dropPrivileges = false;

        # Extra settings file for the agent; its content is generated by
        # environment.etc."nomad-alo.json" in this module.
        extraSettingsPaths = [ "/etc/nomad-alo.json" ];

        settings = {
          datacenter = "alo";

          # The client agent is enabled unconditionally on every node that
          # imports this module.
          client = {
            enabled = true;
            server_join = joinViaServers;
            host_network = {
              tailscale = {
                cidr = "100.64.0.0/10";
                interface = "tailscale0";
              };
            };
            host_volume = {
              services = hostVolume "/data/services" false;
              nix-store = hostVolume "/nix/store" true;
              sw = hostVolume "/run/current-system/sw" true;
            };
          };

          # The server agent is only enabled on nodes that set
          # clusterRole.nomadServer.
          server = {
            enabled = config.clusterRole.nomadServer;
            # Integer division: (n + 2) / 2 evaluates to 2 for a three-entry
            # server list.
            bootstrap_expect = (builtins.length servers + 2) / 2;
            server_join = joinViaServers;
          };

          telemetry = {
            collection_interval = "1s";
            disable_hostname = true;
            prometheus_metrics = true;
            publish_allocation_metrics = true;
            publish_node_metrics = true;
          };
        };
      };

    # NFS mount dependency configuration for Nomad:
    #
    # Problem: Docker bind mounts need the real NFS mount, not an empty stub.
    # If Nomad starts before NFS is mounted, containers get empty directories.
    #
    # Solution: Use soft dependencies (wants/after) with health-checking recovery.
    # - wants: Nomad wants the mount, but won't be killed if it goes away
    # - after: Nomad waits for mount to be attempted before starting
    # - ExecStartPre with findmnt: Blocks Nomad start until mount is actually active
    #
    # This prevents Docker race conditions while allowing:
    # - Boot to proceed if NFS unavailable (Nomad fails to start, systemd retries)
    # - Nomad to keep running if NFS temporarily fails (containers may error)
    # - Recovery service to auto-restart Nomad when NFS comes back or becomes stale
    #
    # Note: Mount uses Consul DNS which resolves at mount time. If NFS server
    #       moves to different IP, mount becomes stale and needs remount.
    #       The recovery service handles this by detecting stale mounts and restarting Nomad.
    systemd.services.nomad =
      let
        # systemd mount unit for /data/services (defined outside this file).
        dataMount = "data-services.mount";
      in
      {
        # Soft dependency: pull in the network and the mount, but do not bind
        # Nomad's lifetime to them.
        wants = [
          "network-online.target"
          dataMount
        ];
        # Only start once the mount unit has been attempted.
        after = [ dataMount ];
        # Start-up gate: findmnt must find a mount at /data/services, otherwise
        # the pre-start step fails and Nomad does not start.
        serviceConfig = {
          ExecStartPre = "${pkgs.util-linux}/bin/findmnt --mountpoint /data/services";
        };
      };

    # Recovery service: automatically restart Nomad when NFS mount needs attention
    # This handles scenarios where:
    # - NFS server was down during boot (mount failed, Nomad hit start-limit)
    # - NFS server failed over to different host with new IP (mount went stale)
    # - Network outage temporarily broke the mount
    #
    # The timer runs every 30s and the watcher checks, in order:
    # 1. Is the mount present at all? If not → do nothing (exit 0)
    # 2. If the mount is present but stale/inaccessible → restart Nomad (intended to trigger a remount)
    # 3. If the mount is healthy but Nomad is in a failed state → restart Nomad (normal recovery)
    systemd.services.nomad-mount-watcher =
      let
        # One-shot health check, run periodically by the timer of the same name.
        # Decision order: mount absent -> no-op; mount present but `stat` fails
        # or times out after 5s -> restart nomad.service; mount healthy and
        # nomad.service in the failed state -> restart nomad.service.
        #
        # NOTE(review): the stale branch only restarts nomad.service. Whether
        # that actually re-establishes the mount depends on how
        # data-services.mount is defined elsewhere — confirm.
        watcherScript = pkgs.writeShellScript "nomad-mount-watcher" ''
          # Check if mount point exists
          if ! ${pkgs.util-linux}/bin/findmnt --mountpoint /data/services >/dev/null 2>&1; then
            exit 0  # Mount not present, nothing to do
          fi

          # Check if mount is actually accessible (not stale)
          # Use timeout to avoid hanging on stale NFS mounts
          if ! ${pkgs.coreutils}/bin/timeout 5s ${pkgs.coreutils}/bin/stat /data/services >/dev/null 2>&1; then
            echo "NFS mount is stale or inaccessible. Restarting Nomad to trigger remount..."
            ${pkgs.systemd}/bin/systemctl restart nomad.service
            exit 0
          fi

          # Mount is healthy - check if Nomad needs recovery
          if ${pkgs.systemd}/bin/systemctl is-failed nomad.service >/dev/null 2>&1; then
            echo "NFS mount is healthy but Nomad is failed. Restarting Nomad..."
            ${pkgs.systemd}/bin/systemctl restart nomad.service
          fi
        '';
      in
      {
        description = "Restart Nomad when NFS mount needs attention";
        serviceConfig.Type = "oneshot";
        serviceConfig.ExecStart = watcherScript;
      };

    # Periodic trigger for nomad-mount-watcher.service.
    systemd.timers.nomad-mount-watcher = {
      description = "Timer for Nomad mount watcher";
      wantedBy = [ "timers.target" ];

      # Unit to activate on each expiry.
      timerConfig.Unit = "nomad-mount-watcher.service";
      # First run 1min after boot...
      timerConfig.OnBootSec = "1min";
      # ...then again 30s after each activation of the watcher.
      timerConfig.OnUnitActiveSec = "30s";
    };

    # Rendered to /etc/nomad-alo.json, which services.nomad.extraSettingsPaths
    # hands to the agent as an additional settings file.
    environment.etc."nomad-alo.json".text =
      let
        # Settings for the docker task driver plugin.
        dockerPlugin = {
          allow_privileged = true;
          # for keepalived, though only really needing "NET_ADMIN","NET_BROADCAST","NET_RAW" on top of default
          # TODO: trim this down
          allow_caps = [ "all" ];
          volumes = {
            enabled = true;
          };
          extra_labels = [
            "job_name"
            "task_group_name"
            "task_name"
            "node_name"
          ];
        };
      in
      builtins.toJSON {
        plugin = {
          docker.config = dockerPlugin;
          raw_exec.config = {
            enabled = true;
          };
        };
      };

    # Docker and Nomad state directories registered under the persistence root
    # given by config.custom.impermanence.persistPath (option declared outside
    # this file; presumably the impermanence setup — confirm).
    environment.persistence.${config.custom.impermanence.persistPath} = {
      directories = [
        "/var/lib/docker"
        "/var/lib/nomad"
      ];
    };

    # Command-line tools installed system-wide on every node.
    environment.systemPackages = [
      pkgs.nomad
      pkgs.wander
      pkgs.damon
    ];

    networking.firewall =
      let
        isServer = config.clusterRole.nomadServer;
      in
      {
        # 4646 is opened on every node; 4647 and 4648 only on server nodes.
        # (Presumably Nomad's default HTTP API / RPC / Serf ports — verify
        # against the Nomad documentation.)
        allowedTCPPorts = [ 4646 ] ++ lib.optionals isServer [ 4647 4648 ];
        # UDP 4648 is opened on server nodes only.
        allowedUDPPorts = lib.optionals isServer [ 4648 ];
      };
  };
}