Skip to content

Commit

Permalink
Merge pull request #433 from stgraber/main
Browse files Browse the repository at this point in the history
Add more evacuate/shutdown options
  • Loading branch information
tych0 authored Jan 25, 2024
2 parents c2e4d78 + bebcc75 commit c23f139
Show file tree
Hide file tree
Showing 13 changed files with 166 additions and 89 deletions.
102 changes: 62 additions & 40 deletions cmd/incusd/api_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ import (
"github.com/lxc/incus/shared/validate"
)

type evacuateStopFunc func(inst instance.Instance) error
type evacuateStopFunc func(inst instance.Instance, action string) error
type evacuateMigrateFunc func(s *state.State, r *http.Request, inst instance.Instance, targetMemberInfo *db.NodeInfo, live bool, startInstance bool, metadata map[string]any, op *operations.Operation) error

type evacuateOpts struct {
Expand Down Expand Up @@ -2994,39 +2994,63 @@ func clusterNodeStatePost(d *Daemon, r *http.Request) response.Response {

s := d.State()

// Forward request
// Forward request.
resp := forwardedResponseToNode(s, r, name)
if resp != nil {
return resp
}

// Parse the request
// Parse the request.
req := api.ClusterMemberStatePost{}
err = json.NewDecoder(r.Body).Decode(&req)
if err != nil {
return response.BadRequest(err)
}

// Validate the overrides.
if req.Action == "evacuate" && req.Mode != "" {
// Use the validator from the instance logic.
validator := internalInstance.InstanceConfigKeysAny["cluster.evacuate"]
err = validator(req.Mode)
if err != nil {
return response.BadRequest(err)
}
}

if req.Action == "evacuate" {
stopFunc := func(inst instance.Instance) error {
stopFunc := func(inst instance.Instance, action string) error {
l := logger.AddContext(logger.Ctx{"project": inst.Project().Name, "instance": inst.Name()})

// Get the shutdown timeout for the instance.
timeout := inst.ExpandedConfig()["boot.host_shutdown_timeout"]
val, err := strconv.Atoi(timeout)
if err != nil {
val = evacuateHostShutdownDefaultTimeout
}

// Start with a clean shutdown.
err = inst.Shutdown(time.Duration(val) * time.Second)
if err != nil {
l.Warn("Failed shutting down instance, forcing stop", logger.Ctx{"err": err})

// Fallback to forced stop.
if action == "force-stop" {
// Handle forced shutdown.
err = inst.Stop(false)
if err != nil && !errors.Is(err, instanceDrivers.ErrInstanceIsStopped) {
return fmt.Errorf("Failed to stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err)
return fmt.Errorf("Failed to force stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err)
}
} else if action == "stateful-stop" {
// Handle stateful stop.
err = inst.Stop(true)
if err != nil && !errors.Is(err, instanceDrivers.ErrInstanceIsStopped) {
return fmt.Errorf("Failed to stateful stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err)
}
} else {
// Get the shutdown timeout for the instance.
timeout := inst.ExpandedConfig()["boot.host_shutdown_timeout"]
val, err := strconv.Atoi(timeout)
if err != nil {
val = evacuateHostShutdownDefaultTimeout
}

// Start with a clean shutdown.
err = inst.Shutdown(time.Duration(val) * time.Second)
if err != nil {
l.Warn("Failed shutting down instance, forcing stop", logger.Ctx{"err": err})

// Fallback to forced stop.
err = inst.Stop(false)
if err != nil && !errors.Is(err, instanceDrivers.ErrInstanceIsStopped) {
return fmt.Errorf("Failed to stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err)
}
}
}

Expand Down Expand Up @@ -3283,37 +3307,28 @@ func evacuateInstances(ctx context.Context, opts evacuateOpts) error {
l := logger.AddContext(logger.Ctx{"project": instProject.Name, "instance": inst.Name()})

// Check if migratable.
migrate, live := inst.CanMigrate()
action := inst.CanMigrate()

// Apply overrides.
if opts.mode != "" {
if opts.mode == "stop" {
migrate = false
live = false
} else if opts.mode == "migrate" {
migrate = true
live = false
} else if opts.mode == "live-migrate" {
migrate = true
live = true
}
if opts.mode != "" && opts.mode != "auto" {
action = opts.mode
}

// Stop the instance if needed.
isRunning := inst.IsRunning()
if opts.stopInstance != nil && isRunning && !(migrate && live) {
if opts.stopInstance != nil && isRunning && action != "live-migrate" {
metadata["evacuation_progress"] = fmt.Sprintf("Stopping %q in project %q", inst.Name(), instProject.Name)
_ = opts.op.UpdateMetadata(metadata)

err := opts.stopInstance(inst)
err := opts.stopInstance(inst, action)
if err != nil {
return err
}
}

// If not migratable, the instance is just stopped.
if !migrate {
continue
if action != "migrate" {
// Done with this instance.
continue
}
}

// Get candidate cluster members to move instances to.
Expand Down Expand Up @@ -3354,7 +3369,7 @@ func evacuateInstances(ctx context.Context, opts evacuateOpts) error {
}

start := isRunning || instanceShouldAutoStart(inst)
err = opts.migrateInstance(opts.s, opts.r, inst, targetMemberInfo, live, start, metadata, opts.op)
err = opts.migrateInstance(opts.s, opts.r, inst, targetMemberInfo, action == "live-migrate", start, metadata, opts.op)
if err != nil {
return err
}
Expand Down Expand Up @@ -3445,7 +3460,14 @@ func restoreClusterMember(d *Daemon, r *http.Request) response.Response {
metadata["evacuation_progress"] = fmt.Sprintf("Starting %q in project %q", inst.Name(), inst.Project().Name)
_ = op.UpdateMetadata(metadata)

err = inst.Start(false)
// If configured for stateful stop, try restoring its state.
action := inst.CanMigrate()
if action == "stateful-stop" {
err = inst.Start(true)
} else {
err = inst.Start(false)
}

if err != nil {
return fmt.Errorf("Failed to start instance %q: %w", inst.Name(), err)
}
Expand All @@ -3455,8 +3477,8 @@ func restoreClusterMember(d *Daemon, r *http.Request) response.Response {
for _, inst := range instances {
l := logger.AddContext(logger.Ctx{"project": inst.Project().Name, "instance": inst.Name()})

// Check if live-migratable.
_, live := inst.CanMigrate()
// Check the action.
live := inst.CanMigrate() == "live-migrate"

metadata["evacuation_progress"] = fmt.Sprintf("Migrating %q in project %q from %q", inst.Name(), inst.Project().Name, inst.Location())
_ = op.UpdateMetadata(metadata)
Expand Down
33 changes: 28 additions & 5 deletions cmd/incusd/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,14 +213,24 @@ func instancesStart(s *state.State, instances []instance.Instance) {
// Get the instance config.
config := inst.ExpandedConfig()
autoStartDelay := config["boot.autostart.delay"]
shutdownAction := config["boot.host_shutdown_action"]

instLogger := logger.AddContext(logger.Ctx{"project": inst.Project().Name, "instance": inst.Name()})

// Try to start the instance.
var attempt = 0
for {
attempt++
err := inst.Start(false)

var err error
if shutdownAction == "stateful-stop" {
// Attempt to restore state.
err = inst.Start(true)
} else {
// Normal startup.
err = inst.Start(false)
}

if err != nil {
if api.StatusErrorCheck(err, http.StatusServiceUnavailable) {
break // Don't log or retry instances that are not ready to start yet.
Expand Down Expand Up @@ -379,13 +389,26 @@ func instancesShutdown(s *state.State, instances []instance.Instance) {
timeoutSeconds, _ = strconv.Atoi(value)
}

err := inst.Shutdown(time.Second * time.Duration(timeoutSeconds))
if err != nil {
logger.Warn("Failed shutting down instance, forcefully stopping", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
err = inst.Stop(false)
action := inst.ExpandedConfig()["boot.host_shutdown_action"]
if action == "stateful-stop" {
err := inst.Stop(true)
if err != nil {
logger.Warn("Failed statefully stopping instance", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
}
} else if action == "force-stop" {
err := inst.Stop(false)
if err != nil {
logger.Warn("Failed forcefully stopping instance", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
}
} else {
err := inst.Shutdown(time.Second * time.Duration(timeoutSeconds))
if err != nil {
logger.Warn("Failed shutting down instance, forcefully stopping", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
err = inst.Stop(false)
if err != nil {
logger.Warn("Failed forcefully stopping instance", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
}
}
}

if inst.ID() > 0 {
Expand Down
12 changes: 12 additions & 0 deletions doc/api-extensions.md
Original file line number Diff line number Diff line change
Expand Up @@ -2318,3 +2318,15 @@ Adds a new `start` field to the `POST /1.0/instances` API which when set
to `true` will have the instance automatically start upon creation.

In this scenario, the creation and startup is part of a single background operation.

## `clustering_evacuation_stop_options`

This introduces new values for the `cluster.evacuate` option:

* `stateful-stop` has the instance store its state to disk to be resumed on restore.
* `force-stop` has the instance immediately stopped without waiting for it to shut down.

## `boot_host_shutdown_action`

This introduces a new `boot.host_shutdown_action` instance configuration key which can be used to override the default `stop` behavior on system shutdown.
It supports the values `stop`, `stateful-stop` and `force-stop`.
17 changes: 14 additions & 3 deletions doc/config_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ The number of seconds to wait after the instance started before starting the nex
The instance with the highest value is started first.
```

```{config:option} boot.host_shutdown_action instance-boot
:defaultdesc: "stop"
:liveupdate: "yes"
:shortdesc: "What action to take on the instance when the host is shut down"
:type: "string"
Action to take on host shut down
```

```{config:option} boot.host_shutdown_timeout instance-boot
:defaultdesc: "30"
:liveupdate: "yes"
Expand Down Expand Up @@ -176,12 +184,15 @@ Available Modes:
+ If any device is not suitable for migration, the instance will not be migrated (only stopped).
+ Live migration will be used only for virtual machines with the `migration.stateful` setting
enabled and for which all its devices can be migrated as well.
- `live-migrate`: Instances are live-migrated to another node. This means the instance remains running
- `live-migrate`: Instances are live-migrated to another server. This means the instance remains running
and operational during the migration process, ensuring minimal disruption.
- `migrate`: In this mode, instances are migrated to another node in the cluster. The migration
- `migrate`: In this mode, instances are migrated to another server in the cluster. The migration
process will not be live, meaning there will be a brief downtime for the instance during the
migration.
- `stop`: Instances are not migrated. Instead, they are stopped on the current node.
- `stop`: Instances are not migrated. Instead, they are stopped on the current server.
- `stateful-stop`: Instances are not migrated. Instead, they are stopped on the current server
but with their runtime state (memory) stored on disk for resuming on restore.
- `force-stop`: Instances are not migrated. Instead, they are forcefully stopped.

See {ref}`cluster-evacuate` for more information.
```
Expand Down
20 changes: 16 additions & 4 deletions internal/instance/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ var InstanceConfigKeysAny = map[string]func(value string) error{
// shortdesc: What order to shut down the instances in
"boot.stop.priority": validate.Optional(validate.IsInt64),

// gendoc:generate(entity=instance, group=boot, key=boot.host_shutdown_action)
// Action to take on host shut down
// ---
// type: string
// defaultdesc: stop
// liveupdate: yes
// shortdesc: What action to take on the instance when the host is shut down
"boot.host_shutdown_action": validate.Optional(validate.IsOneOf("stop", "force-stop", "stateful-stop")),

// gendoc:generate(entity=instance, group=boot, key=boot.host_shutdown_timeout)
// Number of seconds to wait for the instance to shut down before it is force-stopped.
// ---
Expand Down Expand Up @@ -139,20 +148,23 @@ var InstanceConfigKeysAny = map[string]func(value string) error{
// + If any device is not suitable for migration, the instance will not be migrated (only stopped).
// + Live migration will be used only for virtual machines with the `migration.stateful` setting
// enabled and for which all its devices can be migrated as well.
// - `live-migrate`: Instances are live-migrated to another node. This means the instance remains running
// - `live-migrate`: Instances are live-migrated to another server. This means the instance remains running
// and operational during the migration process, ensuring minimal disruption.
// - `migrate`: In this mode, instances are migrated to another node in the cluster. The migration
// - `migrate`: In this mode, instances are migrated to another server in the cluster. The migration
// process will not be live, meaning there will be a brief downtime for the instance during the
// migration.
// - `stop`: Instances are not migrated. Instead, they are stopped on the current node.
// - `stop`: Instances are not migrated. Instead, they are stopped on the current server.
// - `stateful-stop`: Instances are not migrated. Instead, they are stopped on the current server
// but with their runtime state (memory) stored on disk for resuming on restore.
// - `force-stop`: Instances are not migrated. Instead, they are forcefully stopped.
//
// See {ref}`cluster-evacuate` for more information.
// ---
// type: string
// defaultdesc: `auto`
// liveupdate: no
// shortdesc: What to do when evacuating the instance
"cluster.evacuate": validate.Optional(validate.IsOneOf("auto", "migrate", "live-migrate", "stop")),
"cluster.evacuate": validate.Optional(validate.IsOneOf("auto", "migrate", "live-migrate", "stop", "stateful-stop", "force-stop")),

// gendoc:generate(entity=instance, group=resource-limits, key=limits.cpu)
// A number or a specific range of CPUs to expose to the instance.
Expand Down
30 changes: 10 additions & 20 deletions internal/server/instance/drivers/driver_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -914,27 +914,18 @@ func (d *common) warningsDelete() error {
return nil
}

// canMigrate determines if the given instance can be migrated and whether the migration
// can be live. In "auto" mode, the function checks each attached device of the instance
// to ensure they are all migratable.
func (d *common) canMigrate(inst instance.Instance) (bool, bool) {
// canMigrate determines if the given instance can be migrated and what kind of migration to attempt.
func (d *common) canMigrate(inst instance.Instance) string {
// Check policy for the instance.
config := d.ExpandedConfig()
val, ok := config["cluster.evacuate"]
if !ok {
val = "auto"
}

if val == "migrate" {
return true, false
}

if val == "live-migrate" {
return true, true
}

if val == "stop" {
return false, false
// If not using auto, just return the migration type.
if val != "auto" {
return val
}

// Look at attached devices.
Expand All @@ -944,23 +935,22 @@ func (d *common) canMigrate(inst instance.Instance) (bool, bool) {
dev, err := device.New(inst, d.state, deviceName, rawConfig, volatileGet, volatileSet)
if err != nil {
logger.Warn("Instance will not be migrated due to a device error", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "device": dev.Name(), "err": err})
return false, false
return "stop"
}

if !dev.CanMigrate() {
logger.Warn("Instance will not be migrated because its device cannot be migrated", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "device": dev.Name()})
return false, false
return "stop"
}
}

// Check if set up for live migration.
// Limit automatic live-migration to virtual machines for now.
live := false
if inst.Type() == instancetype.VM {
live = util.IsTrue(config["migration.stateful"])
if inst.Type() == instancetype.VM && util.IsTrue(config["migration.stateful"]) {
return "live-migrate"
}

return true, live
return "migrate"
}

// recordLastState records last power and used time into local config and database config.
Expand Down
Loading

0 comments on commit c23f139

Please sign in to comment.