Add stability debugging for beefy lockups.

- Add netconsole receiver on zippy to capture kernel messages
- Configure beefy as netconsole sender to zippy (192.168.1.2)
- Enable kdump with 256M reserved memory for crash analysis
- Add lockup detectors (softlockup_panic, hung_task_panic, nmi_watchdog)
- Add consoleblank=300 for greeter display sleep
- Persist crash dumps and add analysis tools (crash, makedumpfile)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-12-12 07:23:33 +00:00
parent bd889902be
commit 58c851004d
3 changed files with 71 additions and 3 deletions

View File

@@ -0,0 +1,32 @@
# NixOS module: runs socat as a systemd service that listens on a UDP port
# and copies every datagram it receives to stdout, which systemd forwards to
# the journal under the "netconsole" identifier.
#
# Options (services.netconsoleReceiver):
#   enable       - turn the receiver service on
#   port         - UDP port to listen on (default 6666)
#   openFirewall - also allow that UDP port through the NixOS firewall
#                  (default false, so existing configurations are unchanged)
{
  config,
  lib,
  pkgs,
  ...
}:
let
  cfg = config.services.netconsoleReceiver;
in
{
  options.services.netconsoleReceiver = {
    enable = lib.mkEnableOption "netconsole UDP receiver";

    port = lib.mkOption {
      type = lib.types.port;
      default = 6666;
      description = "UDP port to listen on for netconsole messages";
    };

    openFirewall = lib.mkOption {
      type = lib.types.bool;
      default = false;
      description = "Whether to open the netconsole UDP port in the firewall";
    };
  };

  config = lib.mkIf cfg.enable {
    # The senders are other hosts, so the port has to be reachable from the
    # network; only touch the firewall when explicitly asked to.
    networking.firewall.allowedUDPPorts = lib.mkIf cfg.openFirewall [ cfg.port ];

    systemd.services.netconsole-receiver = {
      description = "Netconsole UDP receiver";
      wantedBy = [ "multi-user.target" ];
      after = [ "network.target" ];
      serviceConfig = {
        # -u: one-way transfer, from the UDP socket to stdout only.
        ExecStart = "${pkgs.socat}/bin/socat -u UDP-LISTEN:${toString cfg.port},fork STDOUT";
        # stdout/stderr go to the journal, tagged so that
        # `journalctl -t netconsole` shows only the received messages.
        StandardOutput = "journal";
        StandardError = "journal";
        SyslogIdentifier = "netconsole";
        Restart = "always";
        RestartSec = "5s";
      };
    };
  };
}

View File

@@ -1,4 +1,4 @@
# `config` is required for the impermanence persist path used further down.
{ pkgs, inputs, config, ... }:
{
imports = [
../../common/encrypted-btrfs-layout.nix
@@ -19,6 +19,38 @@
# Network interface used for cluster traffic on this host.
networking.cluster.primaryInterface = "enp1s0";
# NOTE(review): this Tailscale auth key is committed in plaintext. Treat it as
# leaked: revoke/rotate it and supply the replacement from a secrets mechanism
# instead of a string literal in the repository.
services.tailscaleAutoconnect.authkey = "tskey-auth-k79UsDTw2v11CNTRL-oYqji35BE9c7CqM89Dzs9cBF14PmqYsi";
# Extra kernel command-line parameters.
boot.kernelParams = [
  # Blank the console after 300 seconds (5 minutes) so the greeter display sleeps
  "consoleblank=300"
  # Turn on the NMI watchdog for hard-lockup detection
  "nmi_watchdog=1"
];
# Netconsole - stream kernel messages to zippy (192.168.1.2).
# Target syntax: [src-port]@[src-ip]/[dev],[tgt-port]@<tgt-ip>/[tgt-mac]
# Source port and source IP are left empty, so the kernel picks them itself.
boot.extraModprobeConfig = ''
  options netconsole netconsole=@/enp1s0,6666@192.168.1.2/c0:3f:d5:62:55:bb
'';
# With an empty source IP the kernel takes the address from enp1s0 when the
# module is loaded and refuses to set up the target if the interface has no
# address yet. boot.kernelModules loads modules early in boot, typically
# before the network is configured, so load netconsole from a unit ordered
# after network-online.target instead. modprobe picks up the options above.
systemd.services.netconsole-sender = {
  description = "Load netconsole once the network is online";
  wantedBy = [ "multi-user.target" ];
  wants = [ "network-online.target" ];
  after = [ "network-online.target" ];
  serviceConfig = {
    Type = "oneshot";
    RemainAfterExit = true;
    ExecStart = "${pkgs.kmod}/bin/modprobe netconsole";
  };
};
# Kdump: enable crash dumps with 256M set aside, for kernel crash analysis
boot.crashDump.enable = true;
boot.crashDump.reservedMemory = "256M";
# Lockup detectors - panic on detection so kdump captures state
boot.kernel.sysctl = {
  # Enable all SysRq functions for debugging hangs.
  # (Keys in this set are plain sysctl names; a nested
  # boot.kernel.sysctl."..." path here would create a bogus key.)
  "kernel.sysrq" = 1;
  # Panic on soft lockup (CPU not scheduling for >20s)
  "kernel.softlockup_panic" = 1;
  # Panic on hung tasks (blocked >120s)
  "kernel.hung_task_panic" = 1;
  "kernel.hung_task_timeout_secs" = 120;
};
# Keep /var/crash under the impermanence persist path so crash dumps are persisted
environment.persistence."${config.custom.impermanence.persistPath}".directories = [ "/var/crash" ];
# Tools for inspecting crash dumps
environment.systemPackages = [
  pkgs.crash
  pkgs.makedumpfile
];
}

View File

@@ -5,9 +5,13 @@
../../common/global
../../common/cluster-member.nix # Consul + storage clients
../../common/nomad-worker.nix # Nomad client (runs jobs)
../../common/netconsole-receiver.nix
./hardware.nix
];
# Receive kernel messages from beefy via netconsole
# (the option is declared in ../../common/netconsole-receiver.nix, imported above).
services.netconsoleReceiver.enable = true;
diskLayout = {
mainDiskDevice = "/dev/disk/by-id/ata-KINGSTON_SKC600MS1024G_50026B7785AE0A92";
#keyDiskDevice = "/dev/disk/by-id/usb-Intenso_Micro_Line_22080777660702-0:0";