Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hetzner partitioning script #948

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 170 additions & 1 deletion nix/hetzner.nix
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,181 @@ with lib;
part swap --recommended --label=swap --fstype=swap --ondisk=vda
part / --fstype=ext4 --label=root --grow --ondisk=vda
'';
type = types.lines;
type = types.nullOr types.lines;
description = ''
Specify layout of partitions and file systems using Anaconda's Kickstart
format. For possible options and commands, please have a look at:

<link xlink:href="http://fedoraproject.org/wiki/Anaconda/Kickstart"/>

If Kickstart is not sufficient for your partitioning needs,
consider the <option>partitioningScript</option> option instead.

The <option>partitions</option> and <option>partitioningScript</option>
options are mutually exclusive.
'';
};

partitioningScript = mkOption {
type = types.nullOr types.lines;
default = null;
example = ''
# Example custom partitioningScript
# that creates an ext4 with external journal, across two RAID1s
# (one on HDDs, one on SSDs).

set -x
set -euo pipefail

# If the SSD doesn't support the RZAT (Return Zero After Trim) feature,
# we can't do the `lazy_journal_init=1` journal creation speedup
# below, so fail early in that case.
# Note that as per
# https://github.com/torvalds/linux/blob/e64f638483a21105c7ce330d543fa1f1c35b5bc7/drivers/ata/libata-core.c#L4242-L4250
# TRIM in general is optional and thus this would be unsafe,
# but the kernel announces RZAT only for a whitelist of devices
# that are known to execute TRIM when requested.
#
# Note that this is probably not needed when the ext4 journal is on top
# of an mdadm RAID (because that one likely guarantees to read zeros from
# freshly initialised RAID arrays, but I haven't checked that this really
# works), but we keep it in here just in case it doesn't work or
# somebody wants to run the journal NOT on top of a RAID.
#
# TODO Fall back to slow `lazy_journal_init=0` if RZAT isn't supported.
if hdparm -I /dev/sda | grep -i 'Deterministic read ZEROs after TRIM'; then echo "RZAT supported, can use lazy_journal_init=1 safely"; else echo "RZAT not supported on /dev/sda, cannot use lazy_journal_init=1 safely, exiting" 1>&2; exit 1; fi
if hdparm -I /dev/sdb | grep -i 'Deterministic read ZEROs after TRIM'; then echo "RZAT supported, can use lazy_journal_init=1 safely"; else echo "RZAT not supported on /dev/sdb, cannot use lazy_journal_init=1 safely, exiting" 1>&2; exit 1; fi

# Stop RAID devices if running, otherwise we can't modify the disks below.
test -b /dev/md0 && mdadm --stop /dev/md0
test -b /dev/md1 && mdadm --stop /dev/md1

# Zero out SSDs with TRIM command, so that `lazy_journal_init=1` can be safely used below.
blkdiscard /dev/sda
blkdiscard /dev/sdb

# Create BIOS boot partition and main partition for each SSD and HDD.
# Note Hetzner does use BIOS, not UEFI.
# We use GPT because these disks could be too large for MSDOS partitions (e.g. 10TB disks).
parted --script -a optimal /dev/sda -- mklabel gpt mkpart primary 1MiB 2MiB set 1 bios_grub on mkpart primary 2MiB '100%'
parted --script -a optimal /dev/sdb -- mklabel gpt mkpart primary 1MiB 2MiB set 1 bios_grub on mkpart primary 2MiB '100%'
parted --script -a optimal /dev/sdc -- mklabel gpt mkpart primary 1MiB 2MiB set 1 bios_grub on mkpart primary 2MiB '100%'
parted --script -a optimal /dev/sdd -- mklabel gpt mkpart primary 1MiB 2MiB set 1 bios_grub on mkpart primary 2MiB '100%'

# Now /dev/sd*1 is the BIOS boot partition, /dev/sd*2 is the one data partition

# Reload partition table so Linux can see the changes
partprobe

# Wait for all devices to exist
udevadm settle --timeout=5 --exit-if-exists=/dev/sda1
udevadm settle --timeout=5 --exit-if-exists=/dev/sda2
udevadm settle --timeout=5 --exit-if-exists=/dev/sdb1
udevadm settle --timeout=5 --exit-if-exists=/dev/sdb2
udevadm settle --timeout=5 --exit-if-exists=/dev/sdc1
udevadm settle --timeout=5 --exit-if-exists=/dev/sdc2
udevadm settle --timeout=5 --exit-if-exists=/dev/sdd1
udevadm settle --timeout=5 --exit-if-exists=/dev/sdd2

# --run makes mdadm not prompt the user for confirmation
mdadm --create --run --verbose /dev/md0 --level=1 --raid-devices=2 /dev/sda2 /dev/sdb2
mdadm --create --run --verbose /dev/md1 --level=1 --raid-devices=2 /dev/sdc2 /dev/sdd2

# Wipe filesystem signatures that might be on the RAID from some
# possibly existing older use of the disks.
# It's not clear to me *why* it is needed, but I have certainly
# observed that it is needed because ext4 labels magically survive
# mdadm RAID re-creations.
# See
# https://serverfault.com/questions/911370/why-does-mdadm-zero-superblock-preserve-file-system-information
wipefs -a /dev/md0
wipefs -a /dev/md1

# Disable RAID recovery. We don't want this to slow down machine provisioning
# in the Hetzner rescue mode. It can run in normal operation after reboot.
echo 0 > /proc/sys/dev/raid/speed_limit_max

# `lazy_journal_init=1` to not have to zero the device;
# we use ATA TRIM with RZAT support to guarantee the device
# is already zeroed; see comment further up about the safety of that.
mke2fs -F -L rootjournal -O journal_dev -E lazy_journal_init=1 /dev/md0
mkfs.ext4 -F -L root -J device=/dev/md0 /dev/md1
'';
description = ''
Script to run after booting into the Hetzner rescue mode
to manually create partitions.

Note as of writing, Hetzner uses BIOS, not UEFI, so if you want
to use GPT partition tables (which you need in case you want to
make partitions larger than 2 TiB) you will likely have to make
a BIOS boot partition
(<link xlink:href="http://fedoraproject.org/wiki/Anaconda/Kickstart"/>).

Where possible, use the simpler <option>partitions</option> option instead of this option.

The <option>partitions</option> and <option>partitioningScript</option>
options are mutually exclusive.

If you use this option, you must set "partitions = null",
you must set "filesystemInfo" to an accurate representation
of the partitions your script creates,
and you must set "mountScript" to mount the created target
root partition at /mnt.
'';
};

mountScript = mkOption {
type = types.nullOr types.lines;
default = null;
example = ''
# Example mountScript matching the example for partitioningScript,
# that creates an ext4 with external journal, across two RAID1s
# (one on HDDs, one on SSDs).

set -e
mount -o data=journal /dev/md1 /mnt
'';
description = ''
Script to run after booting into the Hetzner rescue mode,
and after formatting, to mount the root filesystem at /mnt.

This option is required when "partitioningScript" is used.
'';
};

filesystemInfo = mkOption {
type = types.nullOr types.attrs;
default = null;
example = literalExample ''
{
# Example filesystemInfo matching the example for partitioningScript,
# that creates an ext4 with external journal, across two RAID1s
# (one on HDDs, one on SSDs).
swapDevices = [];
boot.loader.grub.devices = [
"/dev/sda"
"/dev/sdb"
"/dev/sdc"
"/dev/sdd"
];
fileSystems = {
"/" = {
fsType = "ext4";
label = "root";
options = [
"journal_path=/dev/disk/by-label/rootjournal"
"data=journal"
"errors=remount-ro"
];
};
};
}
'';
description = ''
Override the filesystem info obtained from the machine after partitioning.

This option is required when "partitioningScript" is used, but can also
be set if the filesystem info obtained via <option>partitions</option> is not what you need.
'';
};
};
Expand Down
47 changes: 39 additions & 8 deletions nixops/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import subprocess
import time

import nixops.util
import nixops.resources
Expand Down Expand Up @@ -172,7 +173,7 @@ def backup(self, defn, backup_id):
"""Make backup of persistent disks, if possible."""
self.warn("don't know how to make backup of disks for machine ‘{0}’".format(self.name))

def reboot(self, hard=False):
def reboot(self, hard=False, reset=True):
"""Reboot this machine."""
self.log("rebooting...")
if self.state == self.RESCUE:
Expand All @@ -184,16 +185,46 @@ def reboot(self, hard=False):
reboot_command = "systemctl reboot"
self.run_command(reboot_command, check=False)
self.state = self.STARTING
self.ssh.reset()
if reset:
self.ssh.reset()

def reboot_sync(self, hard=False):
"""Reboot this machine and wait until it's up again."""
self.reboot(hard=hard)
self.log_start("waiting for the machine to finish rebooting...")
nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, open=False, callback=lambda: self.log_continue("."))
self.log_continue("[down]")
nixops.util.wait_for_tcp_port(self.get_ssh_name(), self.ssh_port, callback=lambda: self.log_continue("."))
self.log_end("[up]")

# To check when the machine has finished rebooting in a race-free
# manner, we compare the output of `last reboot` before and after
# the reboot. Once the output has changed, the reboot is done.
def get_last_reboot_output():
# Note `last reboot` does not exist on older OSs like
# the Hetzner rescue system, but that doesn't matter much
# because all we care about is when the output of the
# command invocation changes.
# We use timeout=10 so that the user gets some sense
# of progress, as reboots can take a long time.
return self.run_command('last reboot --time-format iso | head -n1', capture_stdout=True, timeout=10).rstrip()

pre_reboot_last_reboot_output = get_last_reboot_output()

# We set reset=False so that we can continue running `last reboot`
# remotely; when the reboot happens, our SSH connection will be reset
# by the remote side instead.
self.reboot(hard=hard, reset=False)

self.log_start("waiting for reboot to complete...")
while True:
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't look robust, because perhaps Hetzner did something wrong and the machine never actually comes up. I see lots of people who write `while True:` loops, but in reality, if it hasn't started after 3 minutes, you will start to wonder what's going on and you would have to do manual work.

As such, this level of automation is an improvement over having nothing, but there is room for improvement.

last_reboot_output = None
try:
last_reboot_output = get_last_reboot_output()
except (nixops.ssh_util.SSHConnectionFailed, nixops.ssh_util.SSHCommandFailed):
# We accept this because the machine might be down,
# and show an 'x' as progress indicator in that case.
self.log_continue("x")
if last_reboot_output is not None and last_reboot_output != pre_reboot_last_reboot_output:
break
self.log_continue(".")
time.sleep(1)
self.log_end("done.")
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a log it would be more useful to see a line that says what was actually done.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@coretemp: This is done in `log_start`, which results in something like `waiting for reboot to complete.....xxxx....done.` However, I guess we could also refactor the logging mechanism to allow for context managers so this becomes clearer.


self.state = self.UP
self.ssh_pinged = True
self._ssh_pinged_this_time = True
Expand Down
7 changes: 4 additions & 3 deletions nixops/backends/azure_vm.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,14 +948,15 @@ def after_activation(self, defn):
self._delete_encryption_key(d_id)


def reboot(self, hard=False):
def reboot(self, hard=False, reset=True):
if hard:
self.log("sending hard reset to Azure machine...")
self.cmc().virtual_machines.restart(self.resource_group, self.machine_name)
self.state = self.STARTING
self.ssh.reset()
if reset:
self.ssh.reset()
else:
MachineState.reboot(self, hard=hard)
MachineState.reboot(self, hard=hard, reset=reset)
self.ssh_pinged = False

def start(self):
Expand Down
2 changes: 1 addition & 1 deletion nixops/backends/ec2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1443,7 +1443,7 @@ def _check(self, res):
res.messages.append(" * {0} - {1}".format(e.not_before, e.not_after))


def reboot(self, hard=False):
def reboot(self, hard=False, reset=True):
self.log("rebooting EC2 machine...")
instance = self._get_instance()
instance.reboot()
Expand Down
4 changes: 2 additions & 2 deletions nixops/backends/gce.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,13 +557,13 @@ def create_node(self, defn):
self.on_host_maintenance = defn.on_host_maintenance


def reboot(self, hard=False):
def reboot(self, hard=False, reset=True):
if hard:
self.log("sending hard reset to GCE machine...")
self.node().reboot()
self.state = self.STARTING
else:
MachineState.reboot(self, hard=hard)
MachineState.reboot(self, hard=hard, reset=reset)


def start(self):
Expand Down
Loading