Skip to content

Commit

Permalink
Use bfq scheduler on control plane, idle I/O for rpm-ostreed
Browse files Browse the repository at this point in the history
Part of solving openshift#1897
A lot more details in https://hackmd.io/WeqiDWMAQP2sNtuPRul9QA

The TL;DR is that the `bfq` I/O scheduler better respects IO priorities,
and also does a better job of handling latency sensitive processes
like `etcd` versus bulk/background I/O.
  • Loading branch information
cgwalters committed Aug 6, 2020
1 parent 01e62b8 commit e04f6d1
Show file tree
Hide file tree
Showing 4 changed files with 159 additions and 4 deletions.
79 changes: 79 additions & 0 deletions pkg/daemon/controlplane.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package daemon

// This file provides changes that we make to the control plane
// only.

import (
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
"strings"

"github.com/golang/glog"
"github.com/pkg/errors"
)

// setRootDeviceSchedulerBFQ makes `bfq` the active I/O scheduler of the
// block device backing the root filesystem, so that etcd and other
// processes share I/O more fairly. Background:
// https://github.com/openshift/machine-config-operator/issues/1897
// (bfq is the current systemd default in Fedora, but not in RHEL8 except
// for NVMe devices.)
//
// Nothing is written when the scheduler file already shows `[bfq]`, so the
// function may be called repeatedly. An error is returned when the root
// device cannot be located or its scheduler file cannot be read or written.
func setRootDeviceSchedulerBFQ() error {
	const wantSched = "bfq"

	devSysfs, err := getRootBlockDeviceSysfs()
	if err != nil {
		return err
	}
	schedFile := filepath.Join(devSysfs, "/queue/scheduler")

	// The currently selected scheduler is the one printed inside square
	// brackets in the sysfs file; skip the write if that is already ours.
	current, err := ioutil.ReadFile(schedFile)
	if err != nil {
		return err
	}
	if active := fmt.Sprintf("[%s]", wantSched); strings.Contains(string(current), active) {
		glog.Infof("Device %s already uses scheduler %s", devSysfs, wantSched)
		return nil
	}

	// Selecting a scheduler is done by writing its bare name to the file.
	out, err := os.OpenFile(schedFile, os.O_WRONLY|os.O_TRUNC, 0644)
	if err != nil {
		return err
	}
	defer out.Close()
	if _, err := out.WriteString(wantSched); err != nil {
		return err
	}
	glog.Infof("Set root blockdev %s to use scheduler %v", devSysfs, wantSched)
	return nil
}

// updateOstreeObjectSync enables "per-object-fsync" which helps avoid
// latency spikes for etcd; see https://github.com/ostreedev/ostree/pull/2152
//
// It runs `ostree config set core.per-object-fsync true` against the repo at
// /sysroot/ostree/repo. On failure the returned error wraps the exec error
// and includes whatever the command printed, so the cause is visible in logs.
func updateOstreeObjectSync() error {
	cmd := exec.Command("ostree", "--repo=/sysroot/ostree/repo", "config", "set", "core.per-object-fsync", "true")
	// Capture stdout+stderr: with a plain Run() the child's output is
	// discarded and a failure is reported only as "exit status N".
	if out, err := cmd.CombinedOutput(); err != nil {
		return errors.Wrapf(err, "Failed to set per-object-fsync for ostree (output: %q)", strings.TrimSpace(string(out)))
	}
	return nil
}

// initializeControlPlane performs the node setup that must only happen on
// control plane nodes. It currently runs two steps, in order: switching the
// root block device to the bfq I/O scheduler, then enabling per-object
// fsync for the ostree repo. The first failing step aborts the sequence and
// its error is returned unchanged; nil means both steps succeeded.
func (dn *Daemon) initializeControlPlane() error {
	if err := setRootDeviceSchedulerBFQ(); err != nil {
		return err
	}
	// Last step: its result is the overall result.
	return updateOstreeObjectSync()
}
52 changes: 52 additions & 0 deletions pkg/daemon/coreos.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package daemon

// This file provides routines that apply on Fedora CoreOS style systems,
// including derivatives like RHEL CoreOS.

import (
"fmt"
"os"
"path/filepath"

"github.com/pkg/errors"
)

// byLabel builds the /dev/disk/by-label/<label> path, i.e. the location of
// the udev generated symlink for the block device carrying the given
// filesystem label. The path is only formatted; nothing is checked on disk.
func byLabel(label string) string {
	const byLabelPattern = "/dev/disk/by-label/%s"
	return fmt.Sprintf(byLabelPattern, label)
}

// getParentDeviceSysfs returns e.g. /sys/devices/pci0000:00/0000:00:05.0/virtio2/block/vda from /dev/vda4, though
// it can be more complex than that with e.g. NVMe.
//
// device may be a device node itself (/dev/vda4) or a symlink to one, such
// as the /dev/disk/by-label/* links. The returned path is the resolved sysfs
// directory of the device, or of its parent disk when the device is a
// partition. An error is returned if either the device path or its
// /sys/class/block entry cannot be resolved.
func getParentDeviceSysfs(device string) (string, error) {
	// Resolve the path completely instead of using a single os.Readlink:
	// Readlink fails on a plain device node and follows only one hop of a
	// symlink chain, whereas only the final kernel device name is wanted.
	resolved, err := filepath.EvalSymlinks(device)
	if err != nil {
		return "", errors.Wrapf(err, "reading %s", device)
	}
	sysfsDevLink := fmt.Sprintf("/sys/class/block/%s", filepath.Base(resolved))
	sysfsDev, err := filepath.EvalSymlinks(sysfsDevLink)
	if err != nil {
		return "", errors.Wrapf(err, "parsing %s", sysfsDevLink)
	}
	// A "partition" entry marks this sysfs directory as a partition; step
	// up one level to reach the directory of the disk that contains it.
	if _, err := os.Stat(filepath.Join(sysfsDev, "partition")); err == nil {
		sysfsDev = filepath.Dir(sysfsDev)
	}
	return sysfsDev, nil
}

// getRootBlockDeviceSysfs locates the sysfs directory of the block device
// that backs the root partition on an FCOS-style system. It probes the
// by-label symlinks for the known root labels in order and hands the first
// one that exists to getParentDeviceSysfs. If none exists, the error names
// the last label path that was tried.
func getRootBlockDeviceSysfs() (string, error) {
	// Probe order matters:
	//   - `crypt_rootfs` exists for RHCOS >= 4.3 but <= 4.6, see
	//     https://github.com/openshift/enhancements/blob/master/enhancements/rhcos/automated-policy-based-disk-encryption.md
	//   - `root` is what we expect on FCOS and RHCOS <= 4.2
	candidates := []string{"crypt_rootfs", "root"}

	var devLink string
	for _, label := range candidates {
		devLink = byLabel(label)
		if _, err := os.Stat(devLink); err == nil {
			return getParentDeviceSysfs(devLink)
		}
	}
	// devLink still holds the final candidate (the plain `root` label).
	return "", fmt.Errorf("Failed to find %s", devLink)
}
24 changes: 20 additions & 4 deletions pkg/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,12 @@ type Daemon struct {
// isControlPlane is true if this node is a control plane (master).
// The machine may also be a worker (with schedulable masters).
isControlPlane bool
booting bool
// nodeInitialized is true when we've performed one-time initialization
// after having updated the node object
nodeInitialized bool
// booting is true when all initial synchronization to the target
// machineconfig is done
booting bool

currentConfigPath string

Expand Down Expand Up @@ -382,15 +387,24 @@ func (dn *Daemon) updateErrorState(err error) {
}
}

// initializeNode is called the first time we get our node object
func (dn *Daemon) initializeNode() {
// initializeNode is called the first time we get our node object; however to
// ensure we handle failures: everything called from here should be idempotent.
func (dn *Daemon) initializeNode() error {
if dn.nodeInitialized {
return nil
}
// Some parts of the MCO dispatch on whether or not we're managing a control plane node
if _, isControlPlane := dn.node.Labels[ctrlcommon.MasterLabel]; isControlPlane {
glog.Infof("Node %s is part of the control plane", dn.node.Name)
if err := dn.initializeControlPlane(); err != nil {
return err
}
dn.isControlPlane = true
} else {
glog.Infof("Node %s is not labeled %s", dn.node.Name, ctrlcommon.MasterLabel)
}
dn.nodeInitialized = true
return nil
}

func (dn *Daemon) syncNode(key string) error {
Expand Down Expand Up @@ -428,7 +442,9 @@ func (dn *Daemon) syncNode(key string) error {
node = node.DeepCopy()
if dn.node == nil {
dn.node = node
dn.initializeNode()
if err := dn.initializeNode(); err != nil {
return err
}
} else {
dn.node = node
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: rpm-ostreed.service
dropins:
- name: mco-controlplane-nice.conf
contents: |
# See https://github.com/openshift/machine-config-operator/issues/1897
[Service]
Nice=10
IOSchedulingClass=idle

0 comments on commit e04f6d1

Please sign in to comment.