diff --git a/common/netconsole-receiver.nix b/common/netconsole-receiver.nix
new file mode 100644
index 0000000..3291df4
--- /dev/null
+++ b/common/netconsole-receiver.nix
@@ -0,0 +1,46 @@
+# NixOS module: listens for netconsole kernel messages on a UDP port and
+# writes them to the journal under the "netconsole" syslog identifier.
+{
+  config,
+  lib,
+  pkgs,
+  ...
+}:
+let
+  cfg = config.services.netconsoleReceiver;
+in
+{
+  options.services.netconsoleReceiver = {
+    enable = lib.mkEnableOption "netconsole UDP receiver";
+    port = lib.mkOption {
+      type = lib.types.port;
+      default = 6666;
+      description = "UDP port to listen on for netconsole messages";
+    };
+    openFirewall = lib.mkOption {
+      type = lib.types.bool;
+      default = false;
+      description = "Whether to open the UDP listen port in the firewall";
+    };
+  };
+
+  config = lib.mkIf cfg.enable {
+    # Allow inbound netconsole datagrams through the firewall when requested.
+    networking.firewall.allowedUDPPorts = lib.mkIf cfg.openFirewall [ cfg.port ];
+
+    systemd.services.netconsole-receiver = {
+      description = "Netconsole UDP receiver";
+      wantedBy = [ "multi-user.target" ];
+      after = [ "network.target" ];
+      serviceConfig = {
+        # socat copies received datagrams to stdout, which is routed to the journal.
+        ExecStart = "${pkgs.socat}/bin/socat -u UDP-LISTEN:${toString cfg.port},fork STDOUT";
+        StandardOutput = "journal";
+        StandardError = "journal";
+        SyslogIdentifier = "netconsole";
+        Restart = "always";
+        RestartSec = "5s";
+      };
+    };
+  };
+}
diff --git a/hosts/beefy/default.nix b/hosts/beefy/default.nix
index a912fa7..0c65a3c 100644
--- a/hosts/beefy/default.nix
+++ b/hosts/beefy/default.nix
@@ -1,4 +1,4 @@
-{ pkgs, inputs, ... }:
+{ pkgs, inputs, config, ... }:
 {
   imports = [
     ../../common/encrypted-btrfs-layout.nix
@@ -19,6 +19,38 @@
   networking.cluster.primaryInterface = "enp1s0";
   services.tailscaleAutoconnect.authkey = "tskey-auth-k79UsDTw2v11CNTRL-oYqji35BE9c7CqM89Dzs9cBF14PmqYsi";
 
-  # Enable all SysRq functions for debugging hangs
-  boot.kernel.sysctl."kernel.sysrq" = 1;
+  # Console blanking after 5 minutes (for greeter display sleep)
+  # NMI watchdog for hardlockup detection
+  boot.kernelParams = [ "consoleblank=300" "nmi_watchdog=1" ];
+
+  # Netconsole - stream kernel messages to zippy (192.168.1.2)
+  boot.kernelModules = [ "netconsole" ];
+  boot.extraModprobeConfig = ''
+    options netconsole netconsole=@/enp1s0,6666@192.168.1.2/c0:3f:d5:62:55:bb
+  '';
+
+  # Kdump for kernel crash analysis
+  boot.crashDump = {
+    enable = true;
+    reservedMemory = "256M";
+  };
+
+  # Lockup detectors - panic on detection so kdump captures state
+  boot.kernel.sysctl = {
+    # Enable all SysRq functions for debugging hangs
+    "kernel.sysrq" = 1;
+    # Panic on soft lockup (CPU not scheduling for >20s)
+    "kernel.softlockup_panic" = 1;
+    # Panic on hung tasks (blocked >120s)
+    "kernel.hung_task_panic" = 1;
+    "kernel.hung_task_timeout_secs" = 120;
+  };
+
+  # Persist crash dumps
+  environment.persistence.${config.custom.impermanence.persistPath}.directories = [
+    "/var/crash"
+  ];
+
+  # Crash analysis tools
+  environment.systemPackages = with pkgs; [ crash makedumpfile ];
 }
diff --git a/hosts/zippy/default.nix b/hosts/zippy/default.nix
index f817d98..a8f5d0b 100644
--- a/hosts/zippy/default.nix
+++ b/hosts/zippy/default.nix
@@ -5,9 +5,14 @@
     ../../common/global
     ../../common/cluster-member.nix # Consul + storage clients
     ../../common/nomad-worker.nix # Nomad client (runs jobs)
+    ../../common/netconsole-receiver.nix
     ./hardware.nix
   ];
 
+  # Receive kernel messages from beefy via netconsole
+  services.netconsoleReceiver.enable = true;
+  services.netconsoleReceiver.openFirewall = true;
+
   diskLayout = {
     mainDiskDevice = "/dev/disk/by-id/ata-KINGSTON_SKC600MS1024G_50026B7785AE0A92";
     #keyDiskDevice = "/dev/disk/by-id/usb-Intenso_Micro_Line_22080777660702-0:0";