Skip to content

Commit

Permalink
kola: More fixes for reboot
Browse files Browse the repository at this point in the history
Followup to review comments in
#1575

And also for some reason I didn't fully debug, the reboot
acknowlegement was failing on RHCOS, so let's switch to
using FIFOs for IPC instead of doing "IPC via systemd DBus unit files".
  • Loading branch information
cgwalters authored and openshift-merge-robot committed Jul 13, 2020
1 parent 6833e46 commit 2d1366f
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 46 deletions.
2 changes: 1 addition & 1 deletion ci/run-kola-self-tests
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ set -euo pipefail
dn=$(cd $(dirname $0) && pwd)
toplevel=$(cd ${dn} && git rev-parse --show-toplevel)
set -x
kola run --output-dir tmp/kolaself -E ${toplevel}/tests/kola-ci-self 'ext.kola-ci-self*'
kola run --output-dir tmp/kolaself -E ${toplevel}/tests/kola-ci-self 'ext.kola-ci-self*' "$@"
log=$(find tmp/kolaself -type f -name kola-runext-autopkgtest-reboot.txt)
test -n "${log}"
if ! grep -qF -e 'ok autopkgtest rebooting' < "${log}"; then
Expand Down
88 changes: 67 additions & 21 deletions mantle/cmd/kolet/kolet.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,15 +69,21 @@ const (
// login and unit also need to communicate to make this happen, because the channel
// between the harness and subject is SSH.
//
// The way this works today is that our implementation of the "reboot-prepare" binary
// writes the mark out to a file in /run and then starts a "sleep infinity" as a separate
// systemd unit and blocks on its termination.
// To implement communication, we use a "low tech" method of two FIFOs. (If we need
// more sophistication in the future, it would probably make sense for the login
// session to expose an API over a unix domain socket).
//
// The login notices this unit was started, reads the mark file, then prints out the reboot
// data on stdout, which the harness reads.
// When a reboot binary is invoked, it creates the "reboot acknowledge" FIFO in /run,
// then writes the mark out to a second FIFO that the login session is waiting on.
// The reboot binary then *synchronously* waits on a read from FIFO to signal
// acknowledgement for rebooting.
//
// When the login session finds data in the "reboot request" FIFO, it reads it
// and then prints out the reboot data on stdout, which the harness reads.
//
// The harness then creates a separate SSH session which stops sshd (to avoid any races
// around logging in again), and then stops the "sleep infinity" service.
// around logging in again), and then writes to the acknowlege FIFO,
// allowing the reboot binary to continue.
//
// At this point, the "mark" (or state saved between reboots) is safely on the harness,
// so the test code can invoke e.g. `reboot` or `reboot -ff` etc.
Expand All @@ -88,7 +94,7 @@ const (
const (
autopkgTestRebootPath = "/tmp/autopkgtest-reboot"
autopkgtestRebootScript = `#!/bin/bash
set -euo pipefail
set -xeuo pipefail
~core/kolet reboot-request $1
reboot
`
Expand All @@ -100,7 +106,7 @@ exec ~core/kolet reboot-request $1
`

// File used to communicate between the script and the kolet runner internally
rebootStamp = "/run/kolet-reboot"
rebootRequestFifo = "/run/kolet-reboot"
)

var (
Expand Down Expand Up @@ -226,13 +232,10 @@ func dispatchRunExtUnit(unitname string, sdconn *systemddbus.Conn) (bool, error)
}
}

func initiateReboot() error {
contents, err := ioutil.ReadFile(rebootStamp)
if err != nil {
return err
}
func initiateReboot(mark string) error {
systemdjournal.Print(systemdjournal.PriInfo, "Processing reboot request")
res := kola.KoletResult{
Reboot: string(contents),
Reboot: string(mark),
}
buf, err := json.Marshal(&res)
if err != nil {
Expand All @@ -252,6 +255,28 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
return err
}

// Create the reboot cmdline -> login FIFO for the reboot mark and
// proxy it into a channel
rebootChan := make(chan string)
errChan := make(chan error)
err := exec.Command("mkfifo", rebootRequestFifo).Run()
if err != nil {
return err
}
go func() {
rebootReader, err := os.Open(rebootRequestFifo)
if err != nil {
errChan <- err
return
}
defer rebootReader.Close()
buf, err := ioutil.ReadAll(rebootReader)
if err != nil {
errChan <- err
}
rebootChan <- string(buf)
}()

unitname := args[0]
// Restrict this to services, don't need to support anything else right now
if !strings.HasSuffix(unitname, ".service") {
Expand All @@ -271,23 +296,36 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
if err := sdconn.Subscribe(); err != nil {
return err
}
// Check the status now to avoid any race conditions
dispatchRunExtUnit(unitname, sdconn)
unitevents, uniterrs := sdconn.SubscribeUnits(time.Second)
// Watch for changes in the target unit
filterFunc := func(n string) bool {
return n != unitname
}
compareFunc := func(u1, u2 *systemddbus.UnitStatus) bool { return *u1 != *u2 }
unitevents, uniterrs := sdconn.SubscribeUnitsCustom(time.Second, 0, compareFunc, filterFunc)

for {
systemdjournal.Print(systemdjournal.PriInfo, "Awaiting events")
select {
case err := <-errChan:
return err
case reboot := <-rebootChan:
return initiateReboot(reboot)
case m := <-unitevents:
for n := range m {
if n == unitname {
systemdjournal.Print(systemdjournal.PriInfo, "Dispatching %s", n)
r, err := dispatchRunExtUnit(unitname, sdconn)
systemdjournal.Print(systemdjournal.PriInfo, "Done dispatching %s", n)
if err != nil {
return err
}
if r {
return nil
}
} else if n == kola.KoletRebootWaitUnit {
return initiateReboot()
} else {
systemdjournal.Print(systemdjournal.PriInfo, "Unexpected event %v", n)
}
}
case m := <-uniterrs:
Expand All @@ -302,14 +340,22 @@ func runExtUnit(cmd *cobra.Command, args []string) error {
func runReboot(cmd *cobra.Command, args []string) error {
mark := args[0]
systemdjournal.Print(systemdjournal.PriInfo, "Requesting reboot with mark: %s", mark)
err := ioutil.WriteFile(rebootStamp, []byte(mark), 0644)
err := exec.Command("mkfifo", kola.KoletRebootAckFifo).Run()
if err != nil {
return err
}
err = ioutil.WriteFile(rebootRequestFifo, []byte(mark), 0644)
if err != nil {
return err
}
// Synchronously wait until the mark is propagated back to the harness
err = exec.Command("systemd-run", "-q", "--wait", "--unit", kola.KoletRebootWaitUnit, "--", "sleep", "infinity").Run()
f, err := os.Open(kola.KoletRebootAckFifo)
if err != nil {
return errors.Wrapf(err, "starting %s", kola.KoletRebootWaitUnit)
return err
}
buf := make([]byte, 1)
_, err = f.Read(buf)
if err != nil {
return err
}
systemdjournal.Print(systemdjournal.PriInfo, "Reboot request acknowledged")
return nil
Expand Down
47 changes: 23 additions & 24 deletions mantle/kola/harness.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,10 +190,8 @@ type KoletResult struct {
Reboot string
}

// KoletRebootUnit is a systemd unit that sleeps until the harness acknowledges
// a reboot request by stopping it.
const KoletRebootWaitUnit = "kolet-reboot-wait.service"
const KoletExtTestUnit = "kola-runext.service"
const KoletRebootAckFifo = "/run/kolet-reboot-ack"

// NativeRunner is a closure passed to all kola test functions and used
// to run native go functions directly on kola machines. It is necessary
Expand Down Expand Up @@ -551,6 +549,9 @@ func metadataFromTestBinary(executable string) (*externalTestMeta, error) {
return meta, nil
}

// runExternalTest is an implmenetation of the "external" test framework.
// See README-kola-ext.md as well as the comments in kolet.go for reboot
// handling.
func runExternalTest(c cluster.TestCluster, mach platform.Machine) error {
var previousRebootState string
var stdout []byte
Expand All @@ -566,7 +567,6 @@ func runExternalTest(c cluster.TestCluster, mach platform.Machine) error {
c.MustSSHf(mach, "echo AUTOPKGTEST_REBOOT_MARK=%s | sudo tee /run/kola-runext-env", previousRebootState)
previousRebootState = ""
}
// We ignore stderr here, it will end up in the journal
stdout, stderr, err = mach.SSH(fmt.Sprintf("sudo ./kolet run-test-unit %s", shellquote.Join(KoletExtTestUnit)))
if err != nil {
return errors.Wrapf(err, "kolet run-test-unit failed: %s", stderr)
Expand All @@ -578,28 +578,27 @@ func runExternalTest(c cluster.TestCluster, mach platform.Machine) error {
return errors.Wrapf(err, "parsing kolet json %s", string(stdout))
}
}
if koletRes.Reboot != "" {
previousRebootState = koletRes.Reboot
plog.Debugf("Reboot request with mark='%s'", previousRebootState)
// Stop sshd to ensure we don't race trying to log back in during shutdown
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'systemctl stop sshd && systemctl stop %s'", KoletRebootWaitUnit))
if err != nil {
return errors.Wrapf(err, "failed to stop %s", KoletRebootWaitUnit)
}
plog.Debug("Waiting for reboot")
suberr := mach.WaitForReboot(120*time.Second, bootID)
if suberr == nil {
plog.Debug("Reboot complete")
err = nil
continue
} else {
return errors.Wrapf(suberr, "Waiting for reboot")
}
}
// Otherwise, we're done
if err == nil {
// If no reboot is requested, we're done
if koletRes.Reboot == "" {
return nil
}

// A reboot is requested
previousRebootState = koletRes.Reboot
plog.Debugf("Reboot request with mark='%s'", previousRebootState)
// This signals to the subject that we have saved the mark, and the subject
// can proceed with rebooting. We stop sshd to ensure that the wait below
// doesn't log in while ssh is shutting down.
_, _, err = mach.SSH(fmt.Sprintf("sudo /bin/sh -c 'systemctl stop sshd && echo > %s'", KoletRebootAckFifo))
if err != nil {
return errors.Wrapf(err, "failed to acknowledge reboot")
}
plog.Debug("Waiting for reboot")
err = mach.WaitForReboot(120*time.Second, bootID)
if err != nil {
return errors.Wrapf(err, "Waiting for reboot")
}
plog.Debug("Reboot complete")
}
}

Expand Down

0 comments on commit 2d1366f

Please sign in to comment.