NixOS · nh2 · Jan 27, 2018 · May 6, 2018 · May 8, 2018 · coretemp
diff --git a/nix/hetzner.nix b/nix/hetzner.nix
@@ -90,12 +90,181 @@ with lib;
  part swap --recommended --label=swap --fstype=swap --ondisk=vda
  part / --fstype=ext4 --label=root --grow --ondisk=vda
  '';
- type = types.lines;
+ type = types.nullOr types.lines;
  description = ''
  Specify layout of partitions and file systems using Anacondas Kickstart
  format. For possible options and commands, please have a look at:
 
  <link xlink:href="http://fedoraproject.org/wiki/Anaconda/Kickstart"/>
+
+ If Kickstart is not sufficient for your partitioning needs,
+ consider the <option>partitioningScript</option> option instead.
+
+ The <option>partitions</option> and <option>partitioningScript</option>
+ options are mutually exclusive.
+ '';
+ };
+
+ partitioningScript = mkOption {
+ type = types.nullOr types.lines;
+ default = null;
+ example = ''
+ # Example custom partitioningScript
+ # that creates an ext4 with external journal, across two RAID1s
+ # (one on HDDs, one on SSDs).
+
+ set -x
+ set -euo pipefail
+
+ # If the SSD doesn't support the RZAT (Return Zero After Trim) feature,
+ # we can't do the `lazy_journal_init=1` journal creation speedup
+ # below, so fail early in that case.
+ # Note that as per
+ # https://github.com/torvalds/linux/blob/e64f638483a21105c7ce330d543fa1f1c35b5bc7/drivers/ata/libata-core.c#L4242-L4250
+ # TRIM in general is optional and thus this would be unsafe,
+ # but the kernel announces RZAT only for a whitelist of devices
+ # that are known to execute TRIM when requested.
+ #
+ # Note that this check is probably not needed when the ext4 journal is on
+ # top of an mdadm RAID, because mdadm likely guarantees that freshly
+ # initialised RAID arrays read back as zeros (I haven't verified this).
+ # We keep the check in here anyway, in case that assumption is wrong or
+ # somebody wants to run the journal NOT on top of a RAID.
+ #
+ # TODO Fall back to slow `lazy_journal_init=0` if RZAT isn't supported.
+ if hdparm -I /dev/sda | grep -i 'Deterministic read ZEROs after TRIM'; then echo "RZAT supported, can use lazy_journal_init=1 safely"; else echo "RZAT not supported on /dev/sda, cannot use lazy_journal_init=1 safely, exiting" 1>&2; exit 1; fi
+ if hdparm -I /dev/sdb | grep -i 'Deterministic read ZEROs after TRIM'; then echo "RZAT supported, can use lazy_journal_init=1 safely"; else echo "RZAT not supported on /dev/sdb, cannot use lazy_journal_init=1 safely, exiting" 1>&2; exit 1; fi
+
+ # Stop RAID devices if running, otherwise we can't modify the disks below.
+ test -b /dev/md0 && mdadm --stop /dev/md0
+ test -b /dev/md1 && mdadm --stop /dev/md1
+
+ # Zero out SSDs with TRIM command, so that `lazy_journal_init=1` can be safely used below.
+ blkdiscard /dev/sda
+ blkdiscard /dev/sdb
+
+ # Create BIOS boot partition and main partition for each SSD and HDD.
+ # Note that Hetzner uses BIOS, not UEFI.
+ # We use GPT because these disks could be too large for MSDOS partitions (e.g. 10TB disks).
+ parted --script -a optimal /dev/sda -- mklabel gpt mkpart primary 1MiB 2MiB set 1 bios_grub on mkpart primary 2MiB '100%'
+ parted --script -a optimal /dev/sdb -- mklabel gpt mkpart primary 1MiB 2MiB set 1 bios_grub on mkpart primary 2MiB '100%'
+ parted --script -a optimal /dev/sdc -- mklabel gpt mkpart primary 1MiB 2MiB set 1 bios_grub on mkpart primary 2MiB '100%'
+ parted --script -a optimal /dev/sdd -- mklabel gpt mkpart primary 1MiB 2MiB set 1 bios_grub on mkpart primary 2MiB '100%'
+
+ # Now /dev/sd*1 is the BIOS boot partition, /dev/sd*2 is the one data partition
+
+ # Reload partition table so Linux can see the changes
+ partprobe
+
+ # Wait for all devices to exist
+ udevadm settle --timeout=5 --exit-if-exists=/dev/sda1
+ udevadm settle --timeout=5 --exit-if-exists=/dev/sda2
+ udevadm settle --timeout=5 --exit-if-exists=/dev/sdb1
+ udevadm settle --timeout=5 --exit-if-exists=/dev/sdb2
+ udevadm settle --timeout=5 --exit-if-exists=/dev/sdc1
+ udevadm settle --timeout=5 --exit-if-exists=/dev/sdc2
+ udevadm settle --timeout=5 --exit-if-exists=/dev/sdd1
+ udevadm settle --timeout=5 --exit-if-exists=/dev/sdd2
+
+ # --run makes mdadm not prompt the user for confirmation
+ mdadm --create --run --verbose /dev/md0 --level=1 --raid-devices=2 /dev/sda2 /dev/sdb2
+ mdadm --create --run --verbose /dev/md1 --level=1 --raid-devices=2 /dev/sdc2 /dev/sdd2
+
+ # Wipe filesystem signatures that might be on the RAID from some
+ # possibly existing older use of the disks.
+ # It's not clear to me *why* it is needed, but I have certainly
+ # observed that it is needed because ext4 labels magically survive
+ # mdadm RAID re-creations.
+ # See
+ # https://serverfault.com/questions/911370/why-does-mdadm-zero-superblock-preserve-file-system-information
+ wipefs -a /dev/md0
+ wipefs -a /dev/md1
+
+ # Disable RAID recovery. We don't want this to slow down machine provisioning
+ # in the Hetzner rescue mode. It can run in normal operation after reboot.
+ echo 0 > /proc/sys/dev/raid/speed_limit_max
+
+ # `lazy_journal_init=1` to not have to zero the device;
+ # we use ATA TRIM with RZAT support to guarantee the device
+ # is already zeroed; see comment further up about the safety of that.
+ mke2fs -F -L rootjournal -O journal_dev -E lazy_journal_init=1 /dev/md0
+ mkfs.ext4 -F -L root -J device=/dev/md0 /dev/md1
+ '';
+ description = ''
+ Script to run after booting into the Hetzner rescue mode
+ to manually create partitions.
+
+ Note that, as of writing, Hetzner uses BIOS, not UEFI, so if you want
+ to use GPT partition tables (which you need in case you want to
+ make partitions larger than 2 TiB) you will likely have to make
+ a BIOS boot partition
+ (<link xlink:href="http://fedoraproject.org/wiki/Anaconda/Kickstart"/>).
+
+ Where possible, use the simpler <option>partitions</option> option instead of this option.
+
+ The <option>partitions</option> and <option>partitioningScript</option>
+ options are mutually exclusive.
+
+ If you use this option, you must set "partitions = null",
+ you must set "filesystemInfo" to an accurate representation
+ of the partitions your script creates,
+ and you must set "mountScript" to mount the created target
+ root partition at /mnt.
+ '';
+ };
+
+ mountScript = mkOption {
+ type = types.nullOr types.lines;
+ default = null;
+ example = ''
+ # Example mountScript matching the example for partitioningScript,
+ # that creates an ext4 with external journal, across two RAID1s
+ # (one on HDDs, one on SSDs).
+
+ set -e
+ mount -o data=journal /dev/md1 /mnt
+ '';
+ description = ''
+ Script to run after booting into the Hetzner rescue mode,
+ and after formatting, to mount the root filesystem at /mnt.
+
+ This option is required when "partitioningScript" is used.
+ '';
+ };
+
+ filesystemInfo = mkOption {
+ type = types.nullOr types.attrs;
+ default = null;
+ example = literalExample ''
+ {
+ # Example filesystemInfo matching the example for partitioningScript,
+ # that creates an ext4 with external journal, across two RAID1s
+ # (one on HDDs, one on SSDs).
+ swapDevices = [];
+ boot.loader.grub.devices = [
+ "/dev/sda"
+ "/dev/sdb"
+ "/dev/sdc"
+ "/dev/sdd"
+ ];
+ fileSystems = {
+ "/" = {
+ fsType = "ext4";
+ label = "root";
+ options = [
+ "journal_path=/dev/disk/by-label/rootjournal"
+ "data=journal"
+ "errors=remount-ro"
+ ];
+ };
+ };
+ }
+ '';
+ description = ''
+ Override the filesystem info obtained from the machine after partitioning.
+
+ This option is required when "partitioningScript" is used, but can also
+ be set if the filesystem info obtained via <option>partitions</option> is not what you need.
  '';
  };
  };

diff --git a/nixops/backends/__init__.py b/nixops/backends/__init__.py
@@ -3,6 +3,7 @@
 import os
 import re
 import subprocess
+import time
 
 import nixops.util
 import nixops.resources
@@ -172,7 +173,7 @@ def backup(self, defn, backup_id):
  """Make backup of persistent disks, if possible."""
  self.warn("don't know how to make backup of disks for machine ‘{0}’".format(self.name))
 
- def reboot(self, hard=False):
+ def reboot(self, hard=False, reset=True):
  """Reboot this machine."""
  self.log("rebooting...")
  if self.state == self.RESCUE:
@@ -184,16 +185,46 @@ def reboot(self, hard=False):
  reboot_command = "systemctl reboot"
  self.run_command(reboot_command, check=False)
  self.state = self.STARTING
- self.ssh.reset()
+ if reset:
+ self.ssh.reset()
 
  def reboot_sync(self, hard=False):
  """Reboot this machine and wait until it's up again."""
- self.reboot(hard=hard)
- self.log_start("waiting for the machine to finish rebooting...")
- nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, open=False, callback=lambda: self.log_continue("."))
- self.log_continue("[down]")
- nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, callback=lambda: self.log_continue("."))
- self.log_end("[up]")
+
+ # To check when the machine has finished rebooting in a race-free
+ # manner, we compare the output of `last reboot` before and after
+ # the reboot. Once the output has changed, the reboot is done.
+ def get_last_reboot_output():
+ # Note `last reboot` does not exist on older OSs like
+ # the Hetzner rescue system, but that doesn't matter much
+ # because all we care about is when the output of the
+ # command invocation changes.
+ # We use timeout=10 so that the user gets some sense
+ # of progress, as reboots can take a long time.
+ return self.run_command('last reboot --time-format iso | head -n1', capture_stdout=True, timeout=10).rstrip()
+
+ pre_reboot_last_reboot_output = get_last_reboot_output()
+
+ # We set reset=False so that we can continue running `last reboot`
+ # remotely; when the reboot happens, our SSH connection will be reset
+ # by the remote side instead.
+ self.reboot(hard=hard, reset=False)
+
+ self.log_start("waiting for reboot to complete...")
+ while True:
+ last_reboot_output = None
+ try:
+ last_reboot_output = get_last_reboot_output()
+ except (nixops.ssh_util.SSHConnectionFailed, nixops.ssh_util.SSHCommandFailed):
+ # We accept this because the machine might be down,
+ # and show an 'x' as progress indicator in that case.
+ self.log_continue("x")
+ if last_reboot_output is not None and last_reboot_output != pre_reboot_last_reboot_output:
+ break
+ self.log_continue(".")
+ time.sleep(1)
+ self.log_end("done.")
+
  self.state = self.UP
  self.ssh_pinged = True
  self._ssh_pinged_this_time = True

diff --git a/nixops/backends/azure_vm.py b/nixops/backends/azure_vm.py
@@ -948,14 +948,15 @@ def after_activation(self, defn):
  self._delete_encryption_key(d_id)
 
 
- def reboot(self, hard=False):
+ def reboot(self, hard=False, reset=True):
  if hard:
  self.log("sending hard reset to Azure machine...")
  self.cmc().virtual_machines.restart(self.resource_group, self.machine_name)
  self.state = self.STARTING
- self.ssh.reset()
+ if reset:
+ self.ssh.reset()
  else:
- MachineState.reboot(self, hard=hard)
+ MachineState.reboot(self, hard=hard, reset=reset)
  self.ssh_pinged = False
 
  def start(self):

diff --git a/nixops/backends/ec2.py b/nixops/backends/ec2.py
@@ -1443,7 +1443,7 @@ def _check(self, res):
  res.messages.append(" * {0} - {1}".format(e.not_before, e.not_after))
 
 
- def reboot(self, hard=False):
+ def reboot(self, hard=False, reset=True):
  self.log("rebooting EC2 machine...")
  instance = self._get_instance()
  instance.reboot()

diff --git a/nixops/backends/gce.py b/nixops/backends/gce.py
@@ -557,13 +557,13 @@ def create_node(self, defn):
  self.on_host_maintenance = defn.on_host_maintenance
 
 
- def reboot(self, hard=False):
+ def reboot(self, hard=False, reset=True):
  if hard:
  self.log("sending hard reset to GCE machine...")
  self.node().reboot()
  self.state = self.STARTING
  else:
- MachineState.reboot(self, hard=hard)
+ MachineState.reboot(self, hard=hard, reset=reset)
 
 
  def start(self):