From 58c851004d6af860529358ff2df1968d30e817db Mon Sep 17 00:00:00 2001
From: Petru Paler
Date: Fri, 12 Dec 2025 07:23:33 +0000
Subject: [PATCH] Add stability debugging for beefy lockups.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add netconsole receiver on zippy to capture kernel messages
  (and open its UDP port in the firewall)
- Configure beefy as netconsole sender to zippy (192.168.1.2)
- Enable kdump with 256M reserved memory for crash analysis
- Add lockup detectors (softlockup_panic, hardlockup_panic,
  hung_task_panic, nmi_watchdog)
- Add consoleblank=300 for greeter display sleep
- Persist crash dumps and add analysis tools (crash, makedumpfile)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
Reviewer notes (below the "---" marker, so not part of the commit message):
- The receiver now opens its UDP port in the firewall and beefy also sets
  kernel.hardlockup_panic. Hunk sizes and the diffstat were updated to
  match, but the post-image blob ids on the "index" lines predate these
  edits.
- The hosts/beefy context shows a Tailscale auth key committed in plain
  text; rotate it and load it from a secret store instead.
- Indentation of context lines was reconstructed; use
  `git apply --ignore-whitespace` if a hunk is rejected.

 common/netconsole-receiver.nix | 42 ++++++++++++++++++++++++++++++++++++++++++
 hosts/beefy/default.nix        | 43 ++++++++++++++++++++++++++++++++++++++++---
 hosts/zippy/default.nix        |  4 ++++
 3 files changed, 86 insertions(+), 3 deletions(-)
 create mode 100644 common/netconsole-receiver.nix

diff --git a/common/netconsole-receiver.nix b/common/netconsole-receiver.nix
new file mode 100644
index 0000000..3291df4
--- /dev/null
+++ b/common/netconsole-receiver.nix
@@ -0,0 +1,42 @@
+# Journal-backed receiver for kernel messages sent by remote netconsole hosts.
+{
+  config,
+  lib,
+  pkgs,
+  ...
+}:
+let
+  cfg = config.services.netconsoleReceiver;
+in
+{
+  options.services.netconsoleReceiver = {
+    enable = lib.mkEnableOption "netconsole UDP receiver";
+    port = lib.mkOption {
+      type = lib.types.port;
+      default = 6666;
+      description = "UDP port to listen on for netconsole messages";
+    };
+  };
+
+  config = lib.mkIf cfg.enable {
+    # Let the senders' datagrams through the host firewall. NixOS enables
+    # the firewall by default; this is a no-op when it is disabled.
+    networking.firewall.allowedUDPPorts = [ cfg.port ];
+
+    systemd.services.netconsole-receiver = {
+      description = "Netconsole UDP receiver";
+      wantedBy = [ "multi-user.target" ];
+      after = [ "network.target" ];
+      serviceConfig = {
+        # socat writes received datagrams to stdout; systemd forwards that
+        # to the journal under the "netconsole" identifier.
+        ExecStart = "${pkgs.socat}/bin/socat -u UDP-LISTEN:${toString cfg.port},fork STDOUT";
+        StandardOutput = "journal";
+        StandardError = "journal";
+        SyslogIdentifier = "netconsole";
+        Restart = "always";
+        RestartSec = "5s";
+      };
+    };
+  };
+}
diff --git a/hosts/beefy/default.nix b/hosts/beefy/default.nix
index a912fa7..0c65a3c 100644
--- a/hosts/beefy/default.nix
+++ b/hosts/beefy/default.nix
@@ -1,4 +1,4 @@
-{ pkgs, inputs, ... }:
+{ pkgs, inputs, config, ... }:
 {
   imports = [
     ../../common/encrypted-btrfs-layout.nix
@@ -19,6 +19,43 @@
   networking.cluster.primaryInterface = "enp1s0";
   services.tailscaleAutoconnect.authkey = "tskey-auth-k79UsDTw2v11CNTRL-oYqji35BE9c7CqM89Dzs9cBF14PmqYsi";
 
-  # Enable all SysRq functions for debugging hangs
-  boot.kernel.sysctl."kernel.sysrq" = 1;
+  # Console blanking after 5 minutes (for greeter display sleep)
+  # NMI watchdog for hardlockup detection
+  boot.kernelParams = [ "consoleblank=300" "nmi_watchdog=1" ];
+
+  # Netconsole - stream kernel messages to zippy (192.168.1.2)
+  # NOTE(review): no source IP is given, so the module presumably needs
+  # enp1s0 to already have an address when it loads - confirm at boot.
+  boot.kernelModules = [ "netconsole" ];
+  boot.extraModprobeConfig = ''
+    options netconsole netconsole=@/enp1s0,6666@192.168.1.2/c0:3f:d5:62:55:bb
+  '';
+
+  # Kdump for kernel crash analysis
+  boot.crashDump = {
+    enable = true;
+    reservedMemory = "256M";
+  };
+
+  # Lockup detectors - panic on detection so kdump captures state
+  boot.kernel.sysctl = {
+    # Enable all SysRq functions for debugging hangs
+    "kernel.sysrq" = 1;
+    # Panic on soft lockup (CPU not scheduling for >20s)
+    "kernel.softlockup_panic" = 1;
+    # Panic on hard lockup found by the NMI watchdog (otherwise it may only
+    # log a warning, leaving nothing for kdump to capture)
+    "kernel.hardlockup_panic" = 1;
+    # Panic on hung tasks (blocked >120s)
+    "kernel.hung_task_panic" = 1;
+    "kernel.hung_task_timeout_secs" = 120;
+  };
+
+  # Persist crash dumps
+  environment.persistence.${config.custom.impermanence.persistPath}.directories = [
+    "/var/crash"
+  ];
+
+  # Crash analysis tools
+  environment.systemPackages = with pkgs; [ crash makedumpfile ];
 }
diff --git a/hosts/zippy/default.nix b/hosts/zippy/default.nix
index f817d98..a8f5d0b 100644
--- a/hosts/zippy/default.nix
+++ b/hosts/zippy/default.nix
@@ -5,9 +5,13 @@
     ../../common/global
     ../../common/cluster-member.nix # Consul + storage clients
     ../../common/nomad-worker.nix # Nomad client (runs jobs)
+    ../../common/netconsole-receiver.nix
     ./hardware.nix
   ];
 
+  # Receive kernel messages from beefy via netconsole
+  services.netconsoleReceiver.enable = true;
+
   diskLayout = {
     mainDiskDevice = "/dev/disk/by-id/ata-KINGSTON_SKC600MS1024G_50026B7785AE0A92";
     #keyDiskDevice = "/dev/disk/by-id/usb-Intenso_Micro_Line_22080777660702-0:0";