# inspiration: https://github.com/astro/skyflake/blob/main/nixos-modules/nomad.nix
#
# NixOS module that runs a Nomad agent on every host. All hosts act as
# clients; the hosts listed in `serverHosts` additionally act as servers.
{ pkgs, config, ... }:
let
  # Hostnames that carry the Nomad server role.
  serverHosts = [ "c1" "c2" "c3" ];

  # True when the host being evaluated is one of the servers.
  isServer = builtins.elem config.networking.hostName serverHosts;

  # Integer division: yields a strict majority of the server list
  # (3 servers -> 2).
  bootstrapExpect = (builtins.length serverHosts + 2) / 2;

  # Build one `host_volume` entry from its access mode and path.
  hostVolume = read_only: path: { inherit path read_only; };

  # Settings for Nomad's Docker task driver plugin.
  dockerPluginConfig = {
    allow_privileged = true;
    # for keepalived, which really only needs "NET_ADMIN", "NET_BROADCAST"
    # and "NET_RAW" on top of the defaults
    # TODO: trim this down
    allow_caps = [ "all" ];
    volumes.enabled = true;
    extra_labels = [
      "job_name"
      "task_group_name"
      "task_name"
      "node_name"
    ];
  };
in
{
  services.nomad = {
    enable = true;

    # Setting this to true breaks at least CSI volumes.
    # TODO: consider fixing
    dropPrivileges = false;

    settings = {
      datacenter = "alo";

      client = {
        enabled = true;
        server_join = {
          retry_join = serverHosts;
        };
        host_network.tailscale = {
          interface = "tailscale0";
          cidr = "100.64.0.0/10";
        };
        host_volume = {
          services = hostVolume false "/data/services";
          nix-store = hostVolume true "/nix/store";
          sw = hostVolume true "/run/current-system/sw";
        };
      };

      server = {
        enabled = isServer;
        bootstrap_expect = bootstrapExpect;
        server_join = {
          retry_join = serverHosts;
        };
      };

      telemetry = {
        collection_interval = "1s";
        disable_hostname = true;
        prometheus_metrics = true;
        publish_allocation_metrics = true;
        publish_node_metrics = true;
      };
    };

    # Plugin configuration is rendered to JSON below and loaded from here.
    extraSettingsPaths = [ "/etc/nomad-alo.json" ];
  };

  # Work around a race between the NFS automount and Docker bind mounts:
  # without this, Docker can bind-mount the empty automount stub directory
  # before NFS has actually mounted, which causes permission errors and
  # missing data.
  #  - RequiresMountsFor: tells systemd that Nomad depends on /data/services
  #  - ExecStartPre: touches the path so the automount fires before Nomad starts
  # Note: boot still succeeds if NFS is unavailable (Nomad just won't start).
  #
  # TODO: the NFS mount uses Consul DNS, which is resolved to an IP at mount
  #       time. If the NFS server moves to a different IP, the mount goes
  #       stale and has to be remounted. Consider using a VIP, or a health
  #       check that remounts when staleness is detected.
  systemd.services.nomad = {
    wants = [ "network-online.target" ];
    unitConfig = {
      RequiresMountsFor = [ "/data/services" ];
    };
    serviceConfig = {
      ExecStartPre = "${pkgs.coreutils}/bin/ls /data/services";
    };
  };

  environment = {
    # Extra Nomad settings file referenced by `extraSettingsPaths` above.
    etc."nomad-alo.json".text = builtins.toJSON {
      plugin = {
        docker.config = dockerPluginConfig;
        raw_exec.config.enabled = true;
      };
    };

    # State directories kept under /persist.
    persistence."/persist".directories = [
      "/var/lib/docker"
      "/var/lib/nomad"
    ];

    # CLI tooling: the nomad binary plus the wander and damon packages.
    systemPackages = [
      pkgs.nomad
      pkgs.wander
      pkgs.damon
    ];
  };

  # Every host opens TCP 4646; servers additionally open TCP 4647/4648 and
  # UDP 4648 (per upstream docs these are Nomad's default HTTP, RPC and
  # Serf ports).
  networking.firewall = {
    allowedTCPPorts = [ 4646 ] ++ (if isServer then [ 4647 4648 ] else [ ]);
    allowedUDPPorts = if isServer then [ 4648 ] else [ ];
  };
}