alo-cluster/common/nomad.nix

# inspiration: https://github.com/astro/skyflake/blob/main/nixos-modules/nomad.nix
{ pkgs, config, ... }:
let
  # Hostnames that run the Nomad server role. The same list is used as the
  # retry_join target for both the client and the server stanzas.
  servers = [ "c1" "c2" "c3" ];

  # True when this machine's hostname is one of the entries in `servers`.
  server_enabled = builtins.any (name: name == config.networking.hostName) servers;
in
{
  services.nomad = {
    enable = true;

    # Keep privilege dropping off: setting this to true breaks at least
    # CSI volumes.
    # TODO: consider fixing
    dropPrivileges = false;

    # Additional agent configuration file; its contents are generated by
    # environment.etc."nomad-alo.json" in this module.
    extraSettingsPaths = [ "/etc/nomad-alo.json" ];

    settings = {
      datacenter = "alo";

      # The client role is enabled unconditionally and joins via `servers`.
      client.enabled = true;
      client.server_join.retry_join = servers;

      # Named host network bound to the tailscale0 interface.
      client.host_network.tailscale = {
        cidr = "100.64.0.0/10";
        interface = "tailscale0";
      };

      # Host paths exposed to jobs as named host volumes.
      client.host_volume = {
        services = {
          read_only = false;
          path = "/data/services";
        };
        nix-store = {
          read_only = true;
          path = "/nix/store";
        };
        sw = {
          read_only = true;
          path = "/run/current-system/sw";
        };
      };

      # The server role is only enabled on hosts listed in `servers`.
      server.enabled = server_enabled;
      server.server_join.retry_join = servers;
      # Integer division: a strict majority of the server list
      # (2 for the current 3-entry list).
      server.bootstrap_expect = (builtins.length servers) / 2 + 1;

      # Metrics: Prometheus format, node and allocation metrics included,
      # collected every second, without the hostname in metric names.
      telemetry = {
        prometheus_metrics = true;
        publish_node_metrics = true;
        publish_allocation_metrics = true;
        disable_hostname = true;
        collection_interval = "1s";
      };
    };
  };

  # NFS mount dependency configuration for Nomad:
  #
  # Problem: Docker bind mounts need the real NFS mount, not an empty stub.
  # If Nomad starts before NFS is mounted, containers get empty directories.
  #
  # Solution: Use soft dependencies (wants/after) with health-checking recovery.
  # - wants: Nomad wants the mount, but won't be killed if it goes away
  # - after: Nomad waits for mount to be attempted before starting
  # - ExecStartPre with findmnt: Blocks Nomad start until mount is actually active
  #
  # This prevents Docker race conditions while allowing:
  # - Boot to proceed if NFS unavailable (Nomad fails to start, systemd retries)
  # - Nomad to keep running if NFS temporarily fails (containers may error)
  # - Recovery service to auto-restart Nomad when NFS comes back or becomes stale
  #
  # Note: Mount uses Consul DNS which resolves at mount time. If NFS server
  #       moves to different IP, mount becomes stale and needs remount.
  #       The recovery service handles this by detecting stale mounts and restarting Nomad.
  systemd.services.nomad =
    let
      # Mount unit that Nomad is ordered after and softly depends on.
      dataMount = "data-services.mount";
    in
    {
      # Soft dependency only (wants, not requires), so Nomad is not stopped
      # when the mount unit goes away.
      wants = [
        "network-online.target"
        dataMount
      ];
      after = [ dataMount ];

      # Gate startup on /data/services actually being a mount point; a failing
      # findmnt makes the start attempt fail instead of running without NFS.
      serviceConfig.ExecStartPre = "${pkgs.util-linux}/bin/findmnt --mountpoint /data/services";
    };

  # Recovery service: automatically restart Nomad when NFS mount needs attention
  # This handles scenarios where:
  # - NFS server was down during boot (mount failed, Nomad hit start-limit)
  # - NFS server failed over to different host with new IP (mount went stale)
  # - Network outage temporarily broke the mount
  #
  # The timer runs every 30s and checks:
  # 1. Is mount healthy (exists and accessible)?
  # 2. If mount is stale/inaccessible → restart Nomad (triggers remount)
  # 3. If mount is healthy but Nomad failed → restart Nomad (normal recovery)
  systemd.services.nomad-mount-watcher =
    let
      # Absolute store paths of the tools used by the watcher script.
      findmnt = "${pkgs.util-linux}/bin/findmnt";
      timeout = "${pkgs.coreutils}/bin/timeout";
      stat = "${pkgs.coreutils}/bin/stat";
      systemctl = "${pkgs.systemd}/bin/systemctl";

      # One pass of the health check:
      #   - /data/services not mounted        -> do nothing
      #   - mounted but stat fails within 5s  -> restart nomad.service
      #   - mounted, healthy, Nomad is failed -> restart nomad.service
      #
      # NOTE(review): restarting Nomad only pulls in data-services.mount via
      # wants=; whether that actually remounts an already-mounted but stale
      # share depends on how the mount unit is defined - confirm.
      watcherScript = pkgs.writeShellScript "nomad-mount-watcher" ''
        # Check if mount point exists
        if ! ${findmnt} --mountpoint /data/services >/dev/null 2>&1; then
          exit 0  # Mount not present, nothing to do
        fi

        # Check if mount is actually accessible (not stale)
        # Use timeout to avoid hanging on stale NFS mounts
        if ! ${timeout} 5s ${stat} /data/services >/dev/null 2>&1; then
          echo "NFS mount is stale or inaccessible. Restarting Nomad to trigger remount..."
          ${systemctl} restart nomad.service
          exit 0
        fi

        # Mount is healthy - check if Nomad needs recovery
        if ${systemctl} is-failed nomad.service >/dev/null 2>&1; then
          echo "NFS mount is healthy but Nomad is failed. Restarting Nomad..."
          ${systemctl} restart nomad.service
        fi
      '';
    in
    {
      description = "Restart Nomad when NFS mount needs attention";
      serviceConfig = {
        # Runs to completion on each activation; triggered by the timer of
        # the same name.
        Type = "oneshot";
        ExecStart = watcherScript;
      };
    };

  # Periodically triggers nomad-mount-watcher.service.
  systemd.timers.nomad-mount-watcher = {
    description = "Timer for Nomad mount watcher";
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnBootSec = "1min";       # First run 1min after boot
      OnUnitActiveSec = "30s";  # Then every 30s
      # systemd timers default to AccuracySec=1min, which allows every elapse
      # to be delayed by up to a minute - longer than the 30s interval itself.
      # Tighten it so the check really runs on a ~30s cadence.
      AccuracySec = "1s";
      Unit = "nomad-mount-watcher.service";
    };
  };

  # Task-driver plugin configuration, serialized to /etc/nomad-alo.json and
  # passed to the agent through services.nomad.extraSettingsPaths.
  environment.etc."nomad-alo.json".text = builtins.toJSON {
    plugin = {
      # raw_exec driver is switched on.
      raw_exec.config.enabled = true;

      docker.config = {
        volumes.enabled = true;
        allow_privileged = true;

        # for keepalived, though only really needing "NET_ADMIN","NET_BROADCAST","NET_RAW" on top of default
        # TODO: trim this down
        allow_caps = [ "all" ];

        # Extra labels listed for the Docker driver to attach to containers.
        extra_labels = [
          "job_name"
          "task_group_name"
          "task_name"
          "node_name"
        ];
      };
    };
  };

  # Register the Docker and Nomad state directories under the /persist
  # persistence root.
  environment.persistence."/persist".directories = [ "/var/lib/docker" "/var/lib/nomad" ];

  # Tools installed system-wide: the nomad package plus wander and damon.
  environment.systemPackages = [
    pkgs.nomad
    pkgs.wander
    pkgs.damon
  ];

  # Firewall openings. Per Nomad's documented default ports: 4646 is the HTTP
  # API, 4647 is RPC and 4648 is Serf gossip (TCP and UDP).
  networking.firewall = {
    # 4646 is opened on every host; 4647 and 4648 only on server hosts.
    allowedTCPPorts = [ 4646 ] ++ (if server_enabled then [ 4647 4648 ] else [ ]);

    # UDP 4648 is only opened on server hosts.
    allowedUDPPorts = if !server_enabled then [ ] else [ 4648 ];
  };
}