Files
alo-cluster/hosts/beefy/default.nix
Petru Paler 58c851004d Add stability debugging for beefy lockups.
- Add netconsole receiver on zippy to capture kernel messages
- Configure beefy as netconsole sender to zippy (192.168.1.2)
- Enable kdump with 256M reserved memory for crash analysis
- Add lockup detectors (softlockup_panic, hung_task_panic, nmi_watchdog)
- Add consoleblank=300 for greeter display sleep
- Persist crash dumps and add analysis tools (crash, makedumpfile)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-12 07:23:33 +00:00

57 lines
1.8 KiB
Nix

{ pkgs, inputs, config, ... }:
{
# Modules this host is composed from: shared modules under ../../common plus
# this host's own hardware.nix.
imports = [
../../common/encrypted-btrfs-layout.nix
../../common/global
../../common/desktop-node.nix # Hyprland + GUI environment
../../common/cluster-member.nix # Consul + storage clients
../../common/cluster-tools.nix # Nomad CLI (no service)
./hardware.nix
];
# Devices handed to the diskLayout options: the main disk and the key disk.
diskLayout.mainDiskDevice = "/dev/disk/by-id/nvme-CT1000P3PSSD8_25164F81F31D";
# The key disk is currently addressed by kernel device name; the by-id
# alternative is kept for reference:
#   keyDiskDevice = "/dev/disk/by-id/usb-Intenso_Micro_Line_22080777650797-0:0";
# NOTE(review): /dev/sdX names are not guaranteed to be stable across boots -
# confirm that using /dev/sda here is intentional.
diskLayout.keyDiskDevice = "/dev/sda";
# Host name, and the interface set as networking.cluster.primaryInterface.
networking = {
  hostName = "beefy";
  cluster.primaryInterface = "enp1s0";
};
# SECURITY: a Tailscale auth key used to be hard-coded on this line, so it is
# exposed in the repository history. Treat it as compromised: revoke it in the
# Tailscale admin console and issue a replacement.
#
# The replacement key is read at evaluation time from a file that must NOT be
# committed (add it to .gitignore). The file holds the key on a single line;
# newlines are stripped below. For flake-based builds the file still has to be
# visible to the flake source (e.g. `git add --intent-to-add`).
#
# NOTE(review): a value read at evaluation time can still end up in the
# world-readable Nix store if the tailscaleAutoconnect module embeds it in a
# script or unit - confirm against that module, and prefer a runtime secret
# (sops-nix / agenix) if the module can accept a file path instead.
services.tailscaleAutoconnect.authkey =
  builtins.replaceStrings [ "\n" ] [ "" ] (builtins.readFile ./tailscale-authkey.secret);
# Kernel-level stability debugging: console blanking, lockup detection,
# netconsole log streaming and kdump.
boot = {
  kernelParams = [
    "consoleblank=300" # blank the console after 5 minutes (greeter display sleep)
    "nmi_watchdog=1" # NMI watchdog, used for hard-lockup detection
  ];

  # netconsole: stream kernel messages to zippy (192.168.1.2), target port
  # 6666, sent out of enp1s0; the last field is the target MAC address.
  # NOTE(review): netconsole needs enp1s0 to exist and be configured at the
  # moment the module is loaded - confirm the messages actually reach zippy.
  kernelModules = [ "netconsole" ];
  extraModprobeConfig = ''
    options netconsole netconsole=@/enp1s0,6666@192.168.1.2/c0:3f:d5:62:55:bb
  '';

  # kdump for kernel crash analysis, with 256M reserved for the crash kernel.
  crashDump.enable = true;
  crashDump.reservedMemory = "256M";

  # Lockup detectors panic on detection so that kdump can capture the state.
  kernel.sysctl = {
    # A task blocked for more than 120 s counts as hung, and a hung task panics.
    "kernel.hung_task_timeout_secs" = 120;
    "kernel.hung_task_panic" = 1;
    # Panic on soft lockup (CPU not scheduling for >20s).
    "kernel.softlockup_panic" = 1;
    # All SysRq functions enabled, for debugging hangs.
    "kernel.sysrq" = 1;
  };
};
environment = {
  # Tools for analysing captured crash dumps.
  systemPackages = [
    pkgs.crash
    pkgs.makedumpfile
  ];

  # Persist /var/crash under the configured impermanence path so crash dumps
  # are kept.
  persistence.${config.custom.impermanence.persistPath}.directories = [ "/var/crash" ];
};
}