Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add more evacuate/shutdown options #433

Merged
merged 16 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 62 additions & 40 deletions cmd/incusd/api_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ import (
"github.com/lxc/incus/shared/validate"
)

type evacuateStopFunc func(inst instance.Instance) error
type evacuateStopFunc func(inst instance.Instance, action string) error
type evacuateMigrateFunc func(s *state.State, r *http.Request, inst instance.Instance, targetMemberInfo *db.NodeInfo, live bool, startInstance bool, metadata map[string]any, op *operations.Operation) error

type evacuateOpts struct {
Expand Down Expand Up @@ -2994,39 +2994,63 @@ func clusterNodeStatePost(d *Daemon, r *http.Request) response.Response {

s := d.State()

// Forward request
// Forward request.
resp := forwardedResponseToNode(s, r, name)
if resp != nil {
return resp
}

// Parse the request
// Parse the request.
req := api.ClusterMemberStatePost{}
err = json.NewDecoder(r.Body).Decode(&req)
if err != nil {
return response.BadRequest(err)
}

// Validate the overrides.
if req.Action == "evacuate" && req.Mode != "" {
// Use the validator from the instance logic.
validator := internalInstance.InstanceConfigKeysAny["cluster.evacuate"]
err = validator(req.Mode)
if err != nil {
return response.BadRequest(err)
}
}

if req.Action == "evacuate" {
stopFunc := func(inst instance.Instance) error {
stopFunc := func(inst instance.Instance, action string) error {
l := logger.AddContext(logger.Ctx{"project": inst.Project().Name, "instance": inst.Name()})

// Get the shutdown timeout for the instance.
timeout := inst.ExpandedConfig()["boot.host_shutdown_timeout"]
val, err := strconv.Atoi(timeout)
if err != nil {
val = evacuateHostShutdownDefaultTimeout
}

// Start with a clean shutdown.
err = inst.Shutdown(time.Duration(val) * time.Second)
if err != nil {
l.Warn("Failed shutting down instance, forcing stop", logger.Ctx{"err": err})

// Fallback to forced stop.
if action == "force-stop" {
// Handle forced shutdown.
err = inst.Stop(false)
if err != nil && !errors.Is(err, instanceDrivers.ErrInstanceIsStopped) {
return fmt.Errorf("Failed to stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err)
return fmt.Errorf("Failed to force stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err)
}
} else if action == "stateful-stop" {
// Handle stateful stop.
err = inst.Stop(true)
if err != nil && !errors.Is(err, instanceDrivers.ErrInstanceIsStopped) {
return fmt.Errorf("Failed to stateful stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err)
}
} else {
// Get the shutdown timeout for the instance.
timeout := inst.ExpandedConfig()["boot.host_shutdown_timeout"]
val, err := strconv.Atoi(timeout)
if err != nil {
val = evacuateHostShutdownDefaultTimeout
}

// Start with a clean shutdown.
err = inst.Shutdown(time.Duration(val) * time.Second)
if err != nil {
l.Warn("Failed shutting down instance, forcing stop", logger.Ctx{"err": err})

// Fallback to forced stop.
err = inst.Stop(false)
if err != nil && !errors.Is(err, instanceDrivers.ErrInstanceIsStopped) {
return fmt.Errorf("Failed to stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err)
}
}
}

Expand Down Expand Up @@ -3283,37 +3307,28 @@ func evacuateInstances(ctx context.Context, opts evacuateOpts) error {
l := logger.AddContext(logger.Ctx{"project": instProject.Name, "instance": inst.Name()})

// Check if migratable.
migrate, live := inst.CanMigrate()
action := inst.CanMigrate()

// Apply overrides.
if opts.mode != "" {
if opts.mode == "stop" {
migrate = false
live = false
} else if opts.mode == "migrate" {
migrate = true
live = false
} else if opts.mode == "live-migrate" {
migrate = true
live = true
}
if opts.mode != "" && opts.mode != "auto" {
action = opts.mode
}

// Stop the instance if needed.
isRunning := inst.IsRunning()
if opts.stopInstance != nil && isRunning && !(migrate && live) {
if opts.stopInstance != nil && isRunning && action != "live-migrate" {
metadata["evacuation_progress"] = fmt.Sprintf("Stopping %q in project %q", inst.Name(), instProject.Name)
_ = opts.op.UpdateMetadata(metadata)

err := opts.stopInstance(inst)
err := opts.stopInstance(inst, action)
if err != nil {
return err
}
}

// If not migratable, the instance is just stopped.
if !migrate {
continue
if action != "migrate" {
// Done with this instance.
continue
}
}

// Get candidate cluster members to move instances to.
Expand Down Expand Up @@ -3354,7 +3369,7 @@ func evacuateInstances(ctx context.Context, opts evacuateOpts) error {
}

start := isRunning || instanceShouldAutoStart(inst)
err = opts.migrateInstance(opts.s, opts.r, inst, targetMemberInfo, live, start, metadata, opts.op)
err = opts.migrateInstance(opts.s, opts.r, inst, targetMemberInfo, action == "live-migrate", start, metadata, opts.op)
if err != nil {
return err
}
Expand Down Expand Up @@ -3445,7 +3460,14 @@ func restoreClusterMember(d *Daemon, r *http.Request) response.Response {
metadata["evacuation_progress"] = fmt.Sprintf("Starting %q in project %q", inst.Name(), inst.Project().Name)
_ = op.UpdateMetadata(metadata)

err = inst.Start(false)
// If configured for stateful stop, try restoring its state.
action := inst.CanMigrate()
if action == "stateful-stop" {
err = inst.Start(true)
} else {
err = inst.Start(false)
}

if err != nil {
return fmt.Errorf("Failed to start instance %q: %w", inst.Name(), err)
}
Expand All @@ -3455,8 +3477,8 @@ func restoreClusterMember(d *Daemon, r *http.Request) response.Response {
for _, inst := range instances {
l := logger.AddContext(logger.Ctx{"project": inst.Project().Name, "instance": inst.Name()})

// Check if live-migratable.
_, live := inst.CanMigrate()
// Check the action.
live := inst.CanMigrate() == "live-migrate"

metadata["evacuation_progress"] = fmt.Sprintf("Migrating %q in project %q from %q", inst.Name(), inst.Project().Name, inst.Location())
_ = op.UpdateMetadata(metadata)
Expand Down
33 changes: 28 additions & 5 deletions cmd/incusd/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,14 +213,24 @@ func instancesStart(s *state.State, instances []instance.Instance) {
// Get the instance config.
config := inst.ExpandedConfig()
autoStartDelay := config["boot.autostart.delay"]
shutdownAction := config["boot.host_shutdown_action"]

instLogger := logger.AddContext(logger.Ctx{"project": inst.Project().Name, "instance": inst.Name()})

// Try to start the instance.
var attempt = 0
for {
attempt++
err := inst.Start(false)

var err error
if shutdownAction == "stateful-stop" {
// Attempt to restore state.
err = inst.Start(true)
} else {
// Normal startup.
err = inst.Start(false)
}

if err != nil {
if api.StatusErrorCheck(err, http.StatusServiceUnavailable) {
break // Don't log or retry instances that are not ready to start yet.
Expand Down Expand Up @@ -379,13 +389,26 @@ func instancesShutdown(s *state.State, instances []instance.Instance) {
timeoutSeconds, _ = strconv.Atoi(value)
}

err := inst.Shutdown(time.Second * time.Duration(timeoutSeconds))
if err != nil {
logger.Warn("Failed shutting down instance, forcefully stopping", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
err = inst.Stop(false)
action := inst.ExpandedConfig()["boot.host_shutdown_action"]
if action == "stateful-stop" {
err := inst.Stop(true)
if err != nil {
logger.Warn("Failed statefully stopping instance", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
}
} else if action == "force-stop" {
err := inst.Stop(false)
if err != nil {
logger.Warn("Failed forcefully stopping instance", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
}
} else {
err := inst.Shutdown(time.Second * time.Duration(timeoutSeconds))
if err != nil {
logger.Warn("Failed shutting down instance, forcefully stopping", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
err = inst.Stop(false)
if err != nil {
logger.Warn("Failed forcefully stopping instance", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err})
}
}
}

if inst.ID() > 0 {
Expand Down
12 changes: 12 additions & 0 deletions doc/api-extensions.md
Original file line number Diff line number Diff line change
Expand Up @@ -2318,3 +2318,15 @@ Adds a new `start` field to the `POST /1.0/instances` API which when set
to `true` will have the instance automatically start upon creation.

In this scenario, the creation and startup is part of a single background operation.

## `clustering_evacuation_stop_options`

This introduces new values for the `cluster.evacuate` option:

* `stateful-stop` has the instance store its state to disk to be resumed on restore.
* `force-stop` has the instance immediately stopped without waiting for it to shut down.

## `boot_host_shutdown_action`

This introduces a new `boot.host_shutdown_action` instance configuration key which can be used to override the default `stop` behavior on system shutdown.
It supports the values `stop`, `stateful-stop` and `force-stop`.
17 changes: 14 additions & 3 deletions doc/config_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ The number of seconds to wait after the instance started before starting the nex
The instance with the highest value is started first.
```

```{config:option} boot.host_shutdown_action instance-boot
:defaultdesc: "stop"
:liveupdate: "yes"
:shortdesc: "What action to take on the instance when the host is shut down"
:type: "string"
Action to take on host shut down
```

```{config:option} boot.host_shutdown_timeout instance-boot
:defaultdesc: "30"
:liveupdate: "yes"
Expand Down Expand Up @@ -176,12 +184,15 @@ Available Modes:
+ If any device is not suitable for migration, the instance will not be migrated (only stopped).
+ Live migration will be used only for virtual machines with the `migration.stateful` setting
enabled and for which all its devices can be migrated as well.
- `live-migrate`: Instances are live-migrated to another node. This means the instance remains running
- `live-migrate`: Instances are live-migrated to another server. This means the instance remains running
and operational during the migration process, ensuring minimal disruption.
- `migrate`: In this mode, instances are migrated to another node in the cluster. The migration
- `migrate`: In this mode, instances are migrated to another server in the cluster. The migration
process will not be live, meaning there will be a brief downtime for the instance during the
migration.
- `stop`: Instances are not migrated. Instead, they are stopped on the current node.
- `stop`: Instances are not migrated. Instead, they are stopped on the current server.
- `stateful-stop`: Instances are not migrated. Instead, they are stopped on the current server
but with their runtime state (memory) stored on disk for resuming on restore.
- `force-stop`: Instances are not migrated. Instead, they are forcefully stopped.

See {ref}`cluster-evacuate` for more information.
```
Expand Down
20 changes: 16 additions & 4 deletions internal/instance/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ var InstanceConfigKeysAny = map[string]func(value string) error{
// shortdesc: What order to shut down the instances in
"boot.stop.priority": validate.Optional(validate.IsInt64),

// gendoc:generate(entity=instance, group=boot, key=boot.host_shutdown_action)
// Action to take on host shut down
// ---
// type: string
// defaultdesc: stop
// liveupdate: yes
// shortdesc: What action to take on the instance when the host is shut down
"boot.host_shutdown_action": validate.Optional(validate.IsOneOf("stop", "force-stop", "stateful-stop")),

// gendoc:generate(entity=instance, group=boot, key=boot.host_shutdown_timeout)
// Number of seconds to wait for the instance to shut down before it is force-stopped.
// ---
Expand Down Expand Up @@ -139,20 +148,23 @@ var InstanceConfigKeysAny = map[string]func(value string) error{
// + If any device is not suitable for migration, the instance will not be migrated (only stopped).
// + Live migration will be used only for virtual machines with the `migration.stateful` setting
// enabled and for which all its devices can be migrated as well.
// - `live-migrate`: Instances are live-migrated to another node. This means the instance remains running
// - `live-migrate`: Instances are live-migrated to another server. This means the instance remains running
// and operational during the migration process, ensuring minimal disruption.
// - `migrate`: In this mode, instances are migrated to another node in the cluster. The migration
// - `migrate`: In this mode, instances are migrated to another server in the cluster. The migration
// process will not be live, meaning there will be a brief downtime for the instance during the
// migration.
// - `stop`: Instances are not migrated. Instead, they are stopped on the current node.
// - `stop`: Instances are not migrated. Instead, they are stopped on the current server.
// - `stateful-stop`: Instances are not migrated. Instead, they are stopped on the current server
// but with their runtime state (memory) stored on disk for resuming on restore.
// - `force-stop`: Instances are not migrated. Instead, they are forcefully stopped.
//
// See {ref}`cluster-evacuate` for more information.
// ---
// type: string
// defaultdesc: `auto`
// liveupdate: no
// shortdesc: What to do when evacuating the instance
"cluster.evacuate": validate.Optional(validate.IsOneOf("auto", "migrate", "live-migrate", "stop")),
"cluster.evacuate": validate.Optional(validate.IsOneOf("auto", "migrate", "live-migrate", "stop", "stateful-stop", "force-stop")),

// gendoc:generate(entity=instance, group=resource-limits, key=limits.cpu)
// A number or a specific range of CPUs to expose to the instance.
Expand Down
30 changes: 10 additions & 20 deletions internal/server/instance/drivers/driver_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -914,27 +914,18 @@ func (d *common) warningsDelete() error {
return nil
}

// canMigrate determines if the given instance can be migrated and whether the migration
// can be live. In "auto" mode, the function checks each attached device of the instance
// to ensure they are all migratable.
func (d *common) canMigrate(inst instance.Instance) (bool, bool) {
// canMigrate determines if the given instance can be migrated and what kind of migration to attempt.
func (d *common) canMigrate(inst instance.Instance) string {
// Check policy for the instance.
config := d.ExpandedConfig()
val, ok := config["cluster.evacuate"]
if !ok {
val = "auto"
}

if val == "migrate" {
return true, false
}

if val == "live-migrate" {
return true, true
}

if val == "stop" {
return false, false
// If not using auto, just return the migration type.
if val != "auto" {
return val
}

// Look at attached devices.
Expand All @@ -944,23 +935,22 @@ func (d *common) canMigrate(inst instance.Instance) (bool, bool) {
dev, err := device.New(inst, d.state, deviceName, rawConfig, volatileGet, volatileSet)
if err != nil {
logger.Warn("Instance will not be migrated due to a device error", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "device": dev.Name(), "err": err})
return false, false
return "stop"
}

if !dev.CanMigrate() {
logger.Warn("Instance will not be migrated because its device cannot be migrated", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "device": dev.Name()})
return false, false
return "stop"
}
}

// Check if set up for live migration.
// Limit automatic live-migration to virtual machines for now.
live := false
if inst.Type() == instancetype.VM {
live = util.IsTrue(config["migration.stateful"])
if inst.Type() == instancetype.VM && util.IsTrue(config["migration.stateful"]) {
return "live-migrate"
}

return true, live
return "migrate"
}

// recordLastState records last power and used time into local config and database config.
Expand Down
Loading
Loading