Add stability debugging for beefy lockups.
- Add netconsole receiver on zippy to capture kernel messages
- Configure beefy as netconsole sender to zippy (192.168.1.2)
- Enable kdump with 256M reserved memory for crash analysis
- Add lockup detectors (softlockup_panic, hung_task_panic, nmi_watchdog)
- Add consoleblank=300 for greeter display sleep
- Persist crash dumps and add analysis tools (crash, makedumpfile)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
32
common/netconsole-receiver.nix
Normal file
32
common/netconsole-receiver.nix
Normal file
@@ -0,0 +1,32 @@
|
||||
# NixOS module: netconsole receiver.
#
# Declares `services.netconsoleReceiver.{enable,port}` and, when enabled,
# runs a socat process that listens on the configured UDP port and copies
# whatever it receives to stdout, which systemd routes to the journal under
# the syslog identifier "netconsole".
#
# NOTE(review): this module does not touch the firewall. If a firewall is
# active on the receiving host, the UDP port must be allowed separately —
# confirm against the host configuration.
{
  config,
  lib,
  pkgs,
  ...
}:
let
  # Shorthand for this module's own option values.
  cfg = config.services.netconsoleReceiver;

  # socat address for the listening side; `fork` keeps the parent listening
  # after a peer has been picked up.
  listenSpec = "UDP-LISTEN:${toString cfg.port},fork";
in
{
  options.services.netconsoleReceiver = {
    enable = lib.mkEnableOption "netconsole UDP receiver";

    port = lib.mkOption {
      type = lib.types.port;
      default = 6666;
      description = "UDP port to listen on for netconsole messages";
    };
  };

  config = lib.mkIf cfg.enable {
    systemd.services.netconsole-receiver = {
      description = "Netconsole UDP receiver";
      after = [ "network.target" ];
      wantedBy = [ "multi-user.target" ];

      serviceConfig = {
        # -u: unidirectional, network -> stdout only.
        ExecStart = "${pkgs.socat}/bin/socat -u ${listenSpec} STDOUT";

        # Both streams go to the journal, tagged "netconsole".
        StandardOutput = "journal";
        StandardError = "journal";
        SyslogIdentifier = "netconsole";

        # Restart the listener whenever it exits, after a 5 second delay.
        Restart = "always";
        RestartSec = "5s";
      };
    };
  };
}
|
||||
@@ -1,4 +1,4 @@
|
||||
{ pkgs, inputs, ... }:
|
||||
{ pkgs, inputs, config, ... }:
|
||||
{
|
||||
imports = [
|
||||
../../common/encrypted-btrfs-layout.nix
|
||||
@@ -19,6 +19,38 @@
|
||||
networking.cluster.primaryInterface = "enp1s0";
|
||||
services.tailscaleAutoconnect.authkey = "tskey-auth-k79UsDTw2v11CNTRL-oYqji35BE9c7CqM89Dzs9cBF14PmqYsi";
|
||||
|
||||
# Console blanking after 5 minutes (for greeter display sleep)
|
||||
# NMI watchdog for hardlockup detection
|
||||
boot.kernelParams = [ "consoleblank=300" "nmi_watchdog=1" ];
|
||||
|
||||
# Netconsole - stream kernel messages to zippy (192.168.1.2)
|
||||
boot.kernelModules = [ "netconsole" ];
|
||||
boot.extraModprobeConfig = ''
|
||||
options netconsole netconsole=@/enp1s0,6666@192.168.1.2/c0:3f:d5:62:55:bb
|
||||
'';
|
||||
|
||||
# Kdump for kernel crash analysis
|
||||
boot.crashDump = {
|
||||
enable = true;
|
||||
reservedMemory = "256M";
|
||||
};
|
||||
|
||||
# Lockup detectors - panic on detection so kdump captures state
|
||||
boot.kernel.sysctl = {
|
||||
# Enable all SysRq functions for debugging hangs
|
||||
boot.kernel.sysctl."kernel.sysrq" = 1;
|
||||
"kernel.sysrq" = 1;
|
||||
# Panic on soft lockup (CPU not scheduling for >20s)
|
||||
"kernel.softlockup_panic" = 1;
|
||||
# Panic on hung tasks (blocked >120s)
|
||||
"kernel.hung_task_panic" = 1;
|
||||
"kernel.hung_task_timeout_secs" = 120;
|
||||
};
|
||||
|
||||
# Persist crash dumps
|
||||
environment.persistence.${config.custom.impermanence.persistPath}.directories = [
|
||||
"/var/crash"
|
||||
];
|
||||
|
||||
# Crash analysis tools
|
||||
environment.systemPackages = with pkgs; [ crash makedumpfile ];
|
||||
}
|
||||
|
||||
@@ -5,9 +5,13 @@
|
||||
../../common/global
|
||||
../../common/cluster-member.nix # Consul + storage clients
|
||||
../../common/nomad-worker.nix # Nomad client (runs jobs)
|
||||
../../common/netconsole-receiver.nix
|
||||
./hardware.nix
|
||||
];
|
||||
|
||||
# Receive kernel messages from beefy via netconsole
|
||||
services.netconsoleReceiver.enable = true;
|
||||
|
||||
diskLayout = {
|
||||
mainDiskDevice = "/dev/disk/by-id/ata-KINGSTON_SKC600MS1024G_50026B7785AE0A92";
|
||||
#keyDiskDevice = "/dev/disk/by-id/usb-Intenso_Micro_Line_22080777660702-0:0";
|
||||
|
||||
Reference in New Issue
Block a user