-
-
Notifications
You must be signed in to change notification settings - Fork 14.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
swraid/mdcheck: add mdcheck timer for swraid device scrub #204713
Conversation
This pull request has been mentioned on NixOS Discourse. There might be relevant details there: |
Feel free to use anything from here: { config, lib, pkgs, ... }:
let
cfg = config.toupstream.hardware.raid;
inherit (lib) mkEnableOption mkIf mkMerge mkOption optional optionalAttrs types;
inherit (import ../../../common/lib.nix { inherit config lib pkgs; }) mkBoolOption;
diskPath = e:
"/dev/disk/by-${e.idBy}/${e.name}";
mdadmConf = ''
MAILADDR = ${cfg.monitor.recipient}
MAILFROM = no-reply@${config.networking.hostName}
''
+ "DEVICE " + lib.concatMapStringsSep " "
(e: ''${diskPath e}${cfg.partitionSuffix}'')
cfg.drives
+ "\n\n"
+ lib.concatMapStringsSep "\n"
(e: "ARRAY /dev/${e.device} " + lib.concatStringsSep " " (lib.mapAttrsToList (k: v: "${k}=${v}") {
metadata = e.version;
UUID = e.uuid;
name = config.networking.hostName + ":" + e.name;
}))
cfg.arrays;
commonServiceConfig = {
PrivateTmp = true;
ProtectSystem = "strict";
ProtectHome = "tmpfs";
};
enable = builtins.length cfg.drives > 0;
doMonitor = builtins.length cfg.arrays > 0;
arrayPath = array:
"/sys/block/${array.device}/md";
in
{
meta.maintainers = with lib.maintainers; [ peterhoeg ];
options.toupstream.hardware.raid = {
partitionSuffix = mkOption {
description = "Partition suffix";
type = types.str;
default = "-part1";
};
earlyInit = mkOption {
description = "Set up the RAID in initrd to ensure that the RAID is assembled with the proper name";
type = types.bool;
default = true;
};
openFirewall = mkBoolOption "Open the firewall for services that can be read remotely." true;
temperature = {
enable = mkBoolOption "Drive temperatures" true;
preferHddtemp = mkBoolOption "Prefer hddtemp instead of drivetemp" false;
};
optimize = {
enable = mkEnableOption "optimize software RAID";
stripeCache = mkOption {
description = "Device stripe cache size";
type = types.ints.positive;
default = 4096;
};
readAhead = mkOption {
description = "Device read-ahead";
type = types.ints.positive;
default = 65535;
};
speed = mkOption {
description = "Maximum resync/rebuild speed";
type = types.ints.positive;
default = 500000;
};
};
drives = mkOption {
description = "List of drives for software RAID";
default = [ ];
type = types.listOf
(types.submodule {
options = {
name = mkOption {
description = "Drive name without the full path.";
type = types.str;
};
idBy = mkOption {
description = "The method to identify the drive.";
type = types.enum [ "path" "wwn" ];
default = "path";
};
};
});
};
monitor = {
recipient = mkOption {
description = "Notification recipient";
type = types.str;
};
};
scrub = {
enable = mkEnableOption "automatic scrubbing";
startAt = mkOption {
description = "When to scrub";
type = types.str;
default = "weekly";
};
};
arrays = mkOption {
description = "software RAID arrays";
default = [ ];
type = types.listOf
(types.submodule {
options = {
device = mkOption {
description = "Device";
type = types.str;
};
uuid = mkOption {
description = "Device UUID";
type = types.str;
};
name = mkOption {
description = "Device name";
type = types.str;
};
version = mkOption {
description = "Version";
type = types.enum [ "0.9" "1.2" ];
default = "1.2";
};
};
});
};
};
config = mkMerge [
(mkIf enable {
hardware = {
sata.timeout = {
deciSeconds = 70;
drives = map (e: { name = e.name; idBy = "path"; }) cfg.drives;
};
};
systemd.services.mdmonitor.enable = false;
})
(mkIf doMonitor {
boot.initrd.mdadmConf = mkIf cfg.earlyInit mdadmConf;
environment.etc."mdadm/mdadm.conf".text = mdadmConf;
systemd.services.mdadm = {
description = "Monitor RAID";
wantedBy = [ "multi-user.target" ];
serviceConfig = commonServiceConfig // {
Type = "exec";
ExecStart = "${pkgs.mdadm}/bin/mdadm --monitor --scan";
};
};
})
(mkIf cfg.optimize.enable {
boot.kernel.sysctl = {
"dev.raid.speed_limit_max" = cfg.optimize.speed;
};
systemd.services = lib.listToAttrs (map
(e: lib.nameValuePair
"optimize-${e.device}"
{
description = "Optimize software RAID speed";
wantedBy = [ "multi-user.target" ];
unitConfig.ConditionPathExists = [
"/dev/${e.device}"
(arrayPath e)
];
script = ''
###
# /dev/${e.device}
###
# read-ahead
${lib.getBin pkgs.util-linux}/bin/blockdev \
--setra ${toString cfg.optimize.readAhead} \
/dev/${e.device}
# stripe cache size
echo ${toString cfg.optimize.stripeCache} > \
${arrayPath e}/stripe_cache_size
# use all resources for recovery
echo max > \
${arrayPath e}/sync_max
'';
serviceConfig = commonServiceConfig // {
Type = "oneshot";
PrivateNetwork = true;
};
})
cfg.arrays);
})
(mkIf cfg.scrub.enable {
systemd.services = lib.listToAttrs (map
(e: lib.nameValuePair
"scrub-${e.device}"
{
description = "Scrub ${e.device}";
inherit (cfg.scrub) startAt;
unitConfig.ConditionPathExists = [
"/dev/${e.device}"
(arrayPath e)
];
serviceConfig = commonServiceConfig // {
Type = "oneshot";
ExecStart = "${pkgs.runtimeShell} -c 'echo scrub > ${arrayPath e}/sync_action'";
PrivateNetwork = true;
};
})
cfg.arrays);
})
(mkIf cfg.temperature.enable {
boot.kernelModules = optional (!cfg.temperature.preferHddtemp) "drivetemp";
hardware.sensor.hddtemp = {
enable = cfg.temperature.preferHddtemp;
drives = map diskPath cfg.drives;
};
networking.firewall.allowedTCPPorts = [
]
++ optional (cfg.openFirewall && cfg.temperature.preferHddtemp) 7634;
})
];
} |
I'm not a fan of the name It's also probably better to use the upstream service units and patch in the correct paths. You can still do something like |
👍 |
@ElvishJerricco @aanderse Thanks for the feedback! I wasn't aware of the drop-in unit concept -- I've adjusted the implementation to now use the upstream systemd units (patched), and renamed the module to |
This pull request has been mentioned on NixOS Discourse. There might be relevant details there: https://discourse.nixos.org/t/prs-ready-for-review/3032/1544 |
}; | ||
}; | ||
|
||
config = mkIf cfg.enable { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
config = mkIf cfg.enable { | |
config = mkIf cfg.enable { | |
systemd.packages = [ pkgs.mdadm ]; |
Aren't you missing that?
{ | ||
options.services.mdcheck = { | ||
enable = mkEnableOption (lib.mdDoc '' | ||
This option enables mdcheck timers that run regular full scrubs of the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This will be prefixed with Wether to
options.services.mdcheck = { | ||
enable = mkEnableOption (lib.mdDoc '' | ||
This option enables mdcheck timers that run regular full scrubs of the | ||
md devices. These processes can also cause high I/O utilization; the |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
md devices. These processes can also cause high I/O utilization; the | |
md devices. These processes can also cause high I/O utilization; the |
configuration option `boot.kernel.sysctl."dev.raid.speed_limit_max"` | ||
can be used to limit the I/O utilization of the mdcheck processes. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should do a nix option for that
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@SuperSandro2000 Why? It would just be trading one oneliner for another.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To make it discoverable. If we already mention a particular setting in a description than this is a nice addition.
systemd OnCalendar expression for when to start an mdcheck operation. | ||
The default is to begin on the first Sunday of every month at 1am. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should link to the doc of that.
Can we add some random offset?
Also first Sunday seems like an arbitrary choice to me. Why not the 1 first day or 1st Saturday? Maybe we should conver this to an example and let the user choose
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The doc to link to would be the CALENDAR EVENTS
section of systemd.time(7)
.
A random delay can be accomplished with RandomizedDelaySec
, but anyway, I think for consistency's sake, we should probably have this implemented equivalently to the services.zfs.autoScrub
, which defaults to Sun, 02:00
and no random delay. (Though I've just noticed that services.btrfs.autoScrub
has a different default of monthly
, which I find disappointing).
default = "6 hours"; | ||
description = lib.mdDoc '' | ||
When mdcheck starts to execute, this option controls how long should | ||
be spent on the scrub operation before stopping. The default |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
be spent on the scrub operation before stopping. The default | |
be spent on the scrub operation before stopping. The default |
mdcheck operation that was paused after `duration` time passed. The | ||
default is to start at 1:05am every day. If there is no in-progress | ||
operation then nothing will be started. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
mdcheck operation that was paused after `duration` time passed. The | |
default is to start at 1:05am every day. If there is no in-progress | |
operation then nothing will be started. | |
mdcheck operation that was paused after `duration` time passed. | |
The default is to start at 1:05am every day. | |
If there is no in-progress operation then nothing will be started. |
@@ -38,6 +38,17 @@ stdenv.mkDerivation rec { | |||
*.rules | |||
''; | |||
|
|||
postInstall = '' | |||
install -D -m 755 misc/mdcheck ''${out}/bin/mdcheck |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
install -D -m 755 misc/mdcheck ''${out}/bin/mdcheck | |
install -D -m 755 misc/mdcheck $out/bin/mdcheck |
--prefix PATH : ''${out}/bin:${lib.makeBinPath [ coreutils util-linux findutils gnugrep ]} | ||
|
||
sed -i \ | ||
-e "s@/usr/share/mdadm/mdcheck@''${out}/bin/mdcheck@" \ | ||
-e "s@ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh@@" \ | ||
''${out}/lib/systemd/system/mdcheck* ''${out}/lib/systemd/system/mdmonitor* |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
--prefix PATH : ''${out}/bin:${lib.makeBinPath [ coreutils util-linux findutils gnugrep ]} | |
sed -i \ | |
-e "s@/usr/share/mdadm/mdcheck@''${out}/bin/mdcheck@" \ | |
-e "s@ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh@@" \ | |
''${out}/lib/systemd/system/mdcheck* ''${out}/lib/systemd/system/mdmonitor* | |
--prefix PATH : $out/bin:${lib.makeBinPath [ coreutils util-linux findutils gnugrep ]} | |
sed -i \ | |
-e "s@/usr/share/mdadm/mdcheck@$out/bin/mdcheck@" \ | |
-e "s@ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh@@" \ | |
$out/lib/systemd/system/mdcheck* $out/lib/systemd/system/mdmonitor* |
If the module adds the mdadm package globally we don't need to prefix $out/bin
This has been sitting here for a while and I just recently was in the weeds with #123466 and #254429. After reviewing the upstream units carefully I strongly vote against getting rid of them. The upstream udev rules and systemd units are intricately intertwined and rebuilding them on the NixOS side will quickly turn into an unsustainable hot mess. Looking at the unreleased changes upstream. If the check timer really doesn't activate (I didn't check that, I only worked to fix the monitor service in those previous PRs), then I'm all for using the drop-in unit approach and keeping the changeset as minimal as possible but would strongly recommend to extend the test. I've broken master yesterday which warranted a further extension of the tests and as this is storage, we need to be careful not having to say "sorry" to our users by screwing up currently working systems. |
@ctheune I believe the PR is currently intended to use the upstream units; it's just currently missing the line |
This is way worse for discoverability than anything that has |
I think this PR isn't needed anymore anyway. We should double check that the original upstream modules really do start the timer, though. |
See #332096, TL;DR: the systemd services installed from the upstream are completely non-functional as they hard-code a path to a script in |
Dang. I'd love for the situation to improve in general but I'm a bit puzzled what a good strategy is to not end up in maintenance hell whenever upstream decides to fiddle around in this (excuse me) rat's nest of udev rules and scripts ... Ideas? Happy to talk on Element about this. |
Well, the good news is automatic scrubbing is just two systemd timer units and one bash script. Nothing complicated or particularly convoluted there. The straightforward approach is to install the bash script to the nix store and patch systemd services with the correct path. Or we could reimplement those units on the Nix side, they're extremely simple. Both are viable IMO, with different tradeoffs. Personally, I'm leaning to the former, as if something changes, it'll be easier to catch at build time. |
Yeah, definitely the former IMHO. I wonder whether I should go through all of those udev rules once to see what's going on there and whether patching/scrubbing has even a chance of getting it right for a long-ish period of time. |
They're not involved with the issue at hand (#332096). At all. That being said, there are some references to non-existent systemd units in those rules, but that's an entirely separate concern. |
Yeah, sorry for the tangent. I've touched that part before and it's really fragile, so I'm looking at the bigger picture. I'd support your suggested change for this specific issue. |
I'm no longer using mdraid, and won't complete any outstanding work on this PR. If someone wants to adopt this change in the future and complete it, please feel free to use the code in this PR without restriction. Closing. |
Description of changes
Adds new options
hardware.raid.swraid.monitor
which, when enabled, ports a systemd timer from the mdadm package to NixOS that runs regularly scheduled scrubs on swraid md* devices.To support the new NixOS module, this PR adjusts the mdadm package to: (a) package the mdcheck script, (b) remove the upstream systemd units (mdstart_, mdmonitor) that are not functional in NixOS.
Questions/uncertainties with this (raised in https://discourse.nixos.org/t/nixos-mdadm-mdcheck/23716):
I’ve named the configuration added hardware.raid.swraid.monitor – is this a good naming scheme?
enable
defaults to false – this maintains compatibility with existing NixOS systems, but, has some risks of being an unsafe configuration since regular raid scrubbing is a somewhat standard safety practice. Should this be true, false, or maybe true-if-stateVersion>="23.05" so that it becomes the default on new systems?I removed some existing systemd units that are packaged by the mdadm package, but aren’t usable because they don’t subst in the right mdcheck/mdadm paths, and instead recreated the units via systemd.timers and systemd.services to allow configurability. Is this a good direction, or, would it be better to use the upstream unit files and tweak them?
This doesn’t run the mdmonitor systemd service which has a couple of functionalities; (a) it emails or runs a program on a device event, (b) it swaps spare devices between raid arrays if appropriate on the event of a failure. I didn’t want to include it because getting the functionality (b) requires enabling the functionality (a), which would require explicit configuration for “what to do”. Is leaving this out pretty reasonable? It could be enabled pretty easily by another module, or enhanced in the future.
Things done
sandbox = true
set innix.conf
? (See Nix manual)nix-shell -p nixpkgs-review --run "nixpkgs-review rev HEAD"
. Note: all changes have to be committed, also see nixpkgs-review usage./result/bin/
)nixos/doc/manual/md-to-db.sh
to update generated release notes