From d740db9fe4b30ee67173997b614003047bbeb227 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 19:17:20 -0500 Subject: [PATCH 01/16] internal/instance: Don't use the node terminology MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- internal/instance/config.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/instance/config.go b/internal/instance/config.go index 518d1192c00..a352f211c27 100644 --- a/internal/instance/config.go +++ b/internal/instance/config.go @@ -139,12 +139,12 @@ var InstanceConfigKeysAny = map[string]func(value string) error{ // + If any device is not suitable for migration, the instance will not be migrated (only stopped). // + Live migration will be used only for virtual machines with the `migration.stateful` setting // enabled and for which all its devices can be migrated as well. - // - `live-migrate`: Instances are live-migrated to another node. This means the instance remains running + // - `live-migrate`: Instances are live-migrated to another server. This means the instance remains running // and operational during the migration process, ensuring minimal disruption. - // - `migrate`: In this mode, instances are migrated to another node in the cluster. The migration + // - `migrate`: In this mode, instances are migrated to another server in the cluster. The migration // process will not be live, meaning there will be a brief downtime for the instance during the // migration. - // - `stop`: Instances are not migrated. Instead, they are stopped on the current node. + // - `stop`: Instances are not migrated. Instead, they are stopped on the current server. // // See {ref}`cluster-evacuate` for more information. 
// --- From 7ae0cdcdab4200b59d68f6cc60f3abaf6ee094ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Thu, 25 Jan 2024 00:14:37 -0500 Subject: [PATCH 02/16] doc: Update configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- doc/config_options.txt | 6 +++--- internal/server/metadata/configuration.json | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/config_options.txt b/doc/config_options.txt index 67da63a8595..800dda10d7b 100644 --- a/doc/config_options.txt +++ b/doc/config_options.txt @@ -176,12 +176,12 @@ Available Modes: + If any device is not suitable for migration, the instance will not be migrated (only stopped). + Live migration will be used only for virtual machines with the `migration.stateful` setting enabled and for which all its devices can be migrated as well. - - `live-migrate`: Instances are live-migrated to another node. This means the instance remains running + - `live-migrate`: Instances are live-migrated to another server. This means the instance remains running and operational during the migration process, ensuring minimal disruption. - - `migrate`: In this mode, instances are migrated to another node in the cluster. The migration + - `migrate`: In this mode, instances are migrated to another server in the cluster. The migration process will not be live, meaning there will be a brief downtime for the instance during the migration. - - `stop`: Instances are not migrated. Instead, they are stopped on the current node. + - `stop`: Instances are not migrated. Instead, they are stopped on the current server. See {ref}`cluster-evacuate` for more information. 
``` diff --git a/internal/server/metadata/configuration.json b/internal/server/metadata/configuration.json index 7dcfe0615a1..2f0b42d2800 100644 --- a/internal/server/metadata/configuration.json +++ b/internal/server/metadata/configuration.json @@ -194,7 +194,7 @@ "cluster.evacuate": { "defaultdesc": "`auto`", "liveupdate": "no", - "longdesc": "The `cluster.evacuate` provides control over how instances are handled when a cluster member is being\nevacuated.\n\nAvailable Modes:\n - `auto` *(default)*: The system will automatically decide the best evacuation method based on the\n instance's type and configured devices:\n + If any device is not suitable for migration, the instance will not be migrated (only stopped).\n + Live migration will be used only for virtual machines with the `migration.stateful` setting\n enabled and for which all its devices can be migrated as well.\n - `live-migrate`: Instances are live-migrated to another node. This means the instance remains running\n and operational during the migration process, ensuring minimal disruption.\n - `migrate`: In this mode, instances are migrated to another node in the cluster. The migration\n process will not be live, meaning there will be a brief downtime for the instance during the\n migration.\n - `stop`: Instances are not migrated. 
Instead, they are stopped on the current node.\n\nSee {ref}`cluster-evacuate` for more information.", + "longdesc": "The `cluster.evacuate` provides control over how instances are handled when a cluster member is being\nevacuated.\n\nAvailable Modes:\n - `auto` *(default)*: The system will automatically decide the best evacuation method based on the\n instance's type and configured devices:\n + If any device is not suitable for migration, the instance will not be migrated (only stopped).\n + Live migration will be used only for virtual machines with the `migration.stateful` setting\n enabled and for which all its devices can be migrated as well.\n - `live-migrate`: Instances are live-migrated to another server. This means the instance remains running\n and operational during the migration process, ensuring minimal disruption.\n - `migrate`: In this mode, instances are migrated to another server in the cluster. The migration\n process will not be live, meaning there will be a brief downtime for the instance during the\n migration.\n - `stop`: Instances are not migrated. 
Instead, they are stopped on the current server.\n\nSee {ref}`cluster-evacuate` for more information.", "shortdesc": "What to do when evacuating the instance", "type": "string" } From 47938943458ddca45309f8cbf18bd3bf8530bbf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 19:24:31 -0500 Subject: [PATCH 03/16] api: clustering_evacuation_stop_options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- doc/api-extensions.md | 7 +++++++ internal/version/api.go | 1 + 2 files changed, 8 insertions(+) diff --git a/doc/api-extensions.md b/doc/api-extensions.md index 68444daa2cf..8773fc1fc20 100644 --- a/doc/api-extensions.md +++ b/doc/api-extensions.md @@ -2318,3 +2318,10 @@ Adds a new `start` field to the `POST /1.0/instances` API which when set to `true` will have the instance automatically start upon creation. In this scenario, the creation and startup is part of a single background operation. + +## `clustering_evacuation_stop_options` + +This introduces new options for the `cluster.evacuate` option: + +* `stateful-stop` has the instance store its state to disk to be resume on restore. +* `force-stop` has the instance immediately stopped without waiting for it to shut down. diff --git a/internal/version/api.go b/internal/version/api.go index 5e143624299..5b600751e0f 100644 --- a/internal/version/api.go +++ b/internal/version/api.go @@ -391,6 +391,7 @@ var APIExtensions = []string{ "disk_io_bus_virtio_blk", "loki_config_instance", "instance_create_start", + "clustering_evacuation_stop_options", } // APIExtensionsCount returns the number of available API extensions. 
From a7b5477ca4ac1e003554b3f13d09cc03e58dd5a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 19:25:07 -0500 Subject: [PATCH 04/16] internal/instance: Extend cluster.evacuate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- internal/instance/config.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/instance/config.go b/internal/instance/config.go index a352f211c27..a63ae76d81f 100644 --- a/internal/instance/config.go +++ b/internal/instance/config.go @@ -145,6 +145,9 @@ var InstanceConfigKeysAny = map[string]func(value string) error{ // process will not be live, meaning there will be a brief downtime for the instance during the // migration. // - `stop`: Instances are not migrated. Instead, they are stopped on the current server. + // - `stateful-stop`: Instances are not migrated. Instead, they are stopped on the current server + // but with their runtime state (memory) stored on disk for resuming on restore. + // - `force-stop`: Instances are not migrated. Instead, they are forcefully stopped. // // See {ref}`cluster-evacuate` for more information. // --- @@ -152,7 +155,7 @@ var InstanceConfigKeysAny = map[string]func(value string) error{ // defaultdesc: `auto` // liveupdate: no // shortdesc: What to do when evacuating the instance - "cluster.evacuate": validate.Optional(validate.IsOneOf("auto", "migrate", "live-migrate", "stop")), + "cluster.evacuate": validate.Optional(validate.IsOneOf("auto", "migrate", "live-migrate", "stop", "stateful-stop", "force-stop")), // gendoc:generate(entity=instance, group=resource-limits, key=limits.cpu) // A number or a specific range of CPUs to expose to the instance. 
From d7f01f1a5aa7ccb5c026a2127d4ccc1c14f65ff5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 20:22:08 -0500 Subject: [PATCH 05/16] incusd/cluster: Add evacuation mode validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- cmd/incusd/api_cluster.go | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cmd/incusd/api_cluster.go b/cmd/incusd/api_cluster.go index 6e72fe89a54..b355164fca0 100644 --- a/cmd/incusd/api_cluster.go +++ b/cmd/incusd/api_cluster.go @@ -2994,19 +2994,29 @@ func clusterNodeStatePost(d *Daemon, r *http.Request) response.Response { s := d.State() - // Forward request + // Forward request. resp := forwardedResponseToNode(s, r, name) if resp != nil { return resp } - // Parse the request + // Parse the request. req := api.ClusterMemberStatePost{} err = json.NewDecoder(r.Body).Decode(&req) if err != nil { return response.BadRequest(err) } + // Validate the overrides. + if req.Action == "evacuate" && req.Mode != "" { + // Use the validator from the instance logic. 
+ validator := internalInstance.InstanceConfigKeysAny["cluster.evacuate"] + err = validator(req.Mode) + if err != nil { + return response.BadRequest(err) + } + } + if req.Action == "evacuate" { stopFunc := func(inst instance.Instance) error { l := logger.AddContext(logger.Ctx{"project": inst.Project().Name, "instance": inst.Name()}) From 75815ac4b58a0d6e9276b56565498370fefd1d27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 20:22:47 -0500 Subject: [PATCH 06/16] incusd/instance: Use a string for CanMigrate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- .../server/instance/drivers/driver_common.go | 30 +++++++------------ .../server/instance/drivers/driver_lxc.go | 2 +- .../server/instance/drivers/driver_qemu.go | 2 +- .../server/instance/instance_interface.go | 2 +- 4 files changed, 13 insertions(+), 23 deletions(-) diff --git a/internal/server/instance/drivers/driver_common.go b/internal/server/instance/drivers/driver_common.go index 5086d7815ab..0f8e12c9deb 100644 --- a/internal/server/instance/drivers/driver_common.go +++ b/internal/server/instance/drivers/driver_common.go @@ -914,10 +914,8 @@ func (d *common) warningsDelete() error { return nil } -// canMigrate determines if the given instance can be migrated and whether the migration -// can be live. In "auto" mode, the function checks each attached device of the instance -// to ensure they are all migratable. -func (d *common) canMigrate(inst instance.Instance) (bool, bool) { +// canMigrate determines if the given instance can be migrated and what kind of migration to attempt. +func (d *common) canMigrate(inst instance.Instance) string { // Check policy for the instance. 
config := d.ExpandedConfig() val, ok := config["cluster.evacuate"] @@ -925,16 +923,9 @@ func (d *common) canMigrate(inst instance.Instance) (bool, bool) { val = "auto" } - if val == "migrate" { - return true, false - } - - if val == "live-migrate" { - return true, true - } - - if val == "stop" { - return false, false + // If not using auto, just return the migration type. + if val != "auto" { + return val } // Look at attached devices. @@ -944,23 +935,22 @@ func (d *common) canMigrate(inst instance.Instance) (bool, bool) { dev, err := device.New(inst, d.state, deviceName, rawConfig, volatileGet, volatileSet) if err != nil { logger.Warn("Instance will not be migrated due to a device error", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "device": dev.Name(), "err": err}) - return false, false + return "stop" } if !dev.CanMigrate() { logger.Warn("Instance will not be migrated because its device cannot be migrated", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "device": dev.Name()}) - return false, false + return "stop" } } // Check if set up for live migration. // Limit automatic live-migration to virtual machines for now. - live := false - if inst.Type() == instancetype.VM { - live = util.IsTrue(config["migration.stateful"]) + if inst.Type() == instancetype.VM && util.IsTrue(config["migration.stateful"]) { + return "live-migrate" } - return true, live + return "migrate" } // recordLastState records last power and used time into local config and database config. diff --git a/internal/server/instance/drivers/driver_lxc.go b/internal/server/instance/drivers/driver_lxc.go index 1c580d56784..0a96c614960 100644 --- a/internal/server/instance/drivers/driver_lxc.go +++ b/internal/server/instance/drivers/driver_lxc.go @@ -8038,7 +8038,7 @@ func (d *lxc) IsRunning() bool { } // CanMigrate returns whether the instance can be migrated. 
-func (d *lxc) CanMigrate() (bool, bool) { +func (d *lxc) CanMigrate() string { return d.canMigrate(d) } diff --git a/internal/server/instance/drivers/driver_qemu.go b/internal/server/instance/drivers/driver_qemu.go index 4827341b5ce..d63ef74dc83 100644 --- a/internal/server/instance/drivers/driver_qemu.go +++ b/internal/server/instance/drivers/driver_qemu.go @@ -7657,7 +7657,7 @@ func (d *qemu) IsFrozen() bool { } // CanMigrate returns whether the instance can be migrated. -func (d *qemu) CanMigrate() (bool, bool) { +func (d *qemu) CanMigrate() string { return d.canMigrate(d) } diff --git a/internal/server/instance/instance_interface.go b/internal/server/instance/instance_interface.go index d2c467119f2..6046e13b498 100644 --- a/internal/server/instance/instance_interface.go +++ b/internal/server/instance/instance_interface.go @@ -157,7 +157,7 @@ type Instance interface { StoragePool() (string, error) // Migration. - CanMigrate() (bool, bool) + CanMigrate() string MigrateSend(args MigrateSendArgs) error MigrateReceive(args MigrateReceiveArgs) error From e46c8089e3aa3b4c322352d1037b4df6aa5fdd09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 20:29:07 -0500 Subject: [PATCH 07/16] incusd/cluster: Update for CanMigrate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- cmd/incusd/api_cluster.go | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/cmd/incusd/api_cluster.go b/cmd/incusd/api_cluster.go index b355164fca0..09473374f19 100644 --- a/cmd/incusd/api_cluster.go +++ b/cmd/incusd/api_cluster.go @@ -3293,25 +3293,16 @@ func evacuateInstances(ctx context.Context, opts evacuateOpts) error { l := logger.AddContext(logger.Ctx{"project": instProject.Name, "instance": inst.Name()}) // Check if migratable. - migrate, live := inst.CanMigrate() + action := inst.CanMigrate() // Apply overrides. 
- if opts.mode != "" { - if opts.mode == "stop" { - migrate = false - live = false - } else if opts.mode == "migrate" { - migrate = true - live = false - } else if opts.mode == "live-migrate" { - migrate = true - live = true - } + if opts.mode != "" && opts.mode != "auto" { + action = opts.mode } // Stop the instance if needed. isRunning := inst.IsRunning() - if opts.stopInstance != nil && isRunning && !(migrate && live) { + if opts.stopInstance != nil && isRunning && action == "stop" { metadata["evacuation_progress"] = fmt.Sprintf("Stopping %q in project %q", inst.Name(), instProject.Name) _ = opts.op.UpdateMetadata(metadata) @@ -3319,10 +3310,8 @@ func evacuateInstances(ctx context.Context, opts evacuateOpts) error { if err != nil { return err } - } - // If not migratable, the instance is just stopped. - if !migrate { + // Done with this instance. continue } @@ -3364,7 +3353,7 @@ func evacuateInstances(ctx context.Context, opts evacuateOpts) error { } start := isRunning || instanceShouldAutoStart(inst) - err = opts.migrateInstance(opts.s, opts.r, inst, targetMemberInfo, live, start, metadata, opts.op) + err = opts.migrateInstance(opts.s, opts.r, inst, targetMemberInfo, action == "live-migrate", start, metadata, opts.op) if err != nil { return err } @@ -3465,8 +3454,8 @@ func restoreClusterMember(d *Daemon, r *http.Request) response.Response { for _, inst := range instances { l := logger.AddContext(logger.Ctx{"project": inst.Project().Name, "instance": inst.Name()}) - // Check if live-migratable. - _, live := inst.CanMigrate() + // Check the action. 
+ live := inst.CanMigrate() == "live-migrate" metadata["evacuation_progress"] = fmt.Sprintf("Migrating %q in project %q from %q", inst.Name(), inst.Project().Name, inst.Location()) _ = op.UpdateMetadata(metadata) From d8494319892f1e928ceeecbb6a6c1109387f1f10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 20:59:50 -0500 Subject: [PATCH 08/16] incusd/cluster: Add stateful-stop and force-stop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- cmd/incusd/api_cluster.go | 65 ++++++++++++++++++++++++++------------- 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/cmd/incusd/api_cluster.go b/cmd/incusd/api_cluster.go index 09473374f19..ece33225fbf 100644 --- a/cmd/incusd/api_cluster.go +++ b/cmd/incusd/api_cluster.go @@ -55,7 +55,7 @@ import ( "github.com/lxc/incus/shared/validate" ) -type evacuateStopFunc func(inst instance.Instance) error +type evacuateStopFunc func(inst instance.Instance, action string) error type evacuateMigrateFunc func(s *state.State, r *http.Request, inst instance.Instance, targetMemberInfo *db.NodeInfo, live bool, startInstance bool, metadata map[string]any, op *operations.Operation) error type evacuateOpts struct { @@ -3018,25 +3018,39 @@ func clusterNodeStatePost(d *Daemon, r *http.Request) response.Response { } if req.Action == "evacuate" { - stopFunc := func(inst instance.Instance) error { + stopFunc := func(inst instance.Instance, action string) error { l := logger.AddContext(logger.Ctx{"project": inst.Project().Name, "instance": inst.Name()}) - // Get the shutdown timeout for the instance. - timeout := inst.ExpandedConfig()["boot.host_shutdown_timeout"] - val, err := strconv.Atoi(timeout) - if err != nil { - val = evacuateHostShutdownDefaultTimeout - } - - // Start with a clean shutdown. 
- err = inst.Shutdown(time.Duration(val) * time.Second) - if err != nil { - l.Warn("Failed shutting down instance, forcing stop", logger.Ctx{"err": err}) - - // Fallback to forced stop. + if action == "force-stop" { + // Handle forced shutdown. err = inst.Stop(false) if err != nil && !errors.Is(err, instanceDrivers.ErrInstanceIsStopped) { - return fmt.Errorf("Failed to stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err) + return fmt.Errorf("Failed to force stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err) + } + } else if action == "stateful-stop" { + // Handle stateful stop. + err = inst.Stop(true) + if err != nil && !errors.Is(err, instanceDrivers.ErrInstanceIsStopped) { + return fmt.Errorf("Failed to stateful stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err) + } + } else { + // Get the shutdown timeout for the instance. + timeout := inst.ExpandedConfig()["boot.host_shutdown_timeout"] + val, err := strconv.Atoi(timeout) + if err != nil { + val = evacuateHostShutdownDefaultTimeout + } + + // Start with a clean shutdown. + err = inst.Shutdown(time.Duration(val) * time.Second) + if err != nil { + l.Warn("Failed shutting down instance, forcing stop", logger.Ctx{"err": err}) + + // Fallback to forced stop. + err = inst.Stop(false) + if err != nil && !errors.Is(err, instanceDrivers.ErrInstanceIsStopped) { + return fmt.Errorf("Failed to stop instance %q in project %q: %w", inst.Name(), inst.Project().Name, err) + } } } @@ -3302,17 +3316,19 @@ func evacuateInstances(ctx context.Context, opts evacuateOpts) error { // Stop the instance if needed. 
isRunning := inst.IsRunning() - if opts.stopInstance != nil && isRunning && action == "stop" { + if opts.stopInstance != nil && isRunning && action != "live-migrate" { metadata["evacuation_progress"] = fmt.Sprintf("Stopping %q in project %q", inst.Name(), instProject.Name) _ = opts.op.UpdateMetadata(metadata) - err := opts.stopInstance(inst) + err := opts.stopInstance(inst, action) if err != nil { return err } - // Done with this instance. - continue + if action != "migrate" { + // Done with this instance. + continue + } } // Get candidate cluster members to move instances to. @@ -3444,7 +3460,14 @@ func restoreClusterMember(d *Daemon, r *http.Request) response.Response { metadata["evacuation_progress"] = fmt.Sprintf("Starting %q in project %q", inst.Name(), inst.Project().Name) _ = op.UpdateMetadata(metadata) - err = inst.Start(false) + // If configured for stateful stop, try restoring its state. + action := inst.CanMigrate() + if action == "stateful-stop" { + err = inst.Start(true) + } else { + err = inst.Start(false) + } + if err != nil { return fmt.Errorf("Failed to start instance %q: %w", inst.Name(), err) } From 79cb8cdea516c42df47845bd3cd7f4e02ba44ca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 23:06:16 -0500 Subject: [PATCH 09/16] doc: Update configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- doc/config_options.txt | 3 +++ internal/server/metadata/configuration.json | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/config_options.txt b/doc/config_options.txt index 800dda10d7b..b3a89ca06ce 100644 --- a/doc/config_options.txt +++ b/doc/config_options.txt @@ -182,6 +182,9 @@ Available Modes: process will not be live, meaning there will be a brief downtime for the instance during the migration. - `stop`: Instances are not migrated. Instead, they are stopped on the current server. 
+ - `stateful-stop`: Instances are not migrated. Instead, they are stopped on the current server + but with their runtime state (memory) stored on disk for resuming on restore. + - `force-stop`: Instances are not migrated. Instead, they are forcefully stopped. See {ref}`cluster-evacuate` for more information. ``` diff --git a/internal/server/metadata/configuration.json b/internal/server/metadata/configuration.json index 2f0b42d2800..6dbfc342638 100644 --- a/internal/server/metadata/configuration.json +++ b/internal/server/metadata/configuration.json @@ -194,7 +194,7 @@ "cluster.evacuate": { "defaultdesc": "`auto`", "liveupdate": "no", - "longdesc": "The `cluster.evacuate` provides control over how instances are handled when a cluster member is being\nevacuated.\n\nAvailable Modes:\n - `auto` *(default)*: The system will automatically decide the best evacuation method based on the\n instance's type and configured devices:\n + If any device is not suitable for migration, the instance will not be migrated (only stopped).\n + Live migration will be used only for virtual machines with the `migration.stateful` setting\n enabled and for which all its devices can be migrated as well.\n - `live-migrate`: Instances are live-migrated to another server. This means the instance remains running\n and operational during the migration process, ensuring minimal disruption.\n - `migrate`: In this mode, instances are migrated to another server in the cluster. The migration\n process will not be live, meaning there will be a brief downtime for the instance during the\n migration.\n - `stop`: Instances are not migrated. 
Instead, they are stopped on the current server.\n\nSee {ref}`cluster-evacuate` for more information.", + "longdesc": "The `cluster.evacuate` provides control over how instances are handled when a cluster member is being\nevacuated.\n\nAvailable Modes:\n - `auto` *(default)*: The system will automatically decide the best evacuation method based on the\n instance's type and configured devices:\n + If any device is not suitable for migration, the instance will not be migrated (only stopped).\n + Live migration will be used only for virtual machines with the `migration.stateful` setting\n enabled and for which all its devices can be migrated as well.\n - `live-migrate`: Instances are live-migrated to another server. This means the instance remains running\n and operational during the migration process, ensuring minimal disruption.\n - `migrate`: In this mode, instances are migrated to another server in the cluster. The migration\n process will not be live, meaning there will be a brief downtime for the instance during the\n migration.\n - `stop`: Instances are not migrated. Instead, they are stopped on the current server.\n - `stateful-stop`: Instances are not migrated. Instead, they are stopped on the current server\n but with their runtime state (memory) stored on disk for resuming on restore.\n - `force-stop`: Instances are not migrated. 
Instead, they are forcefully stopped.\n\nSee {ref}`cluster-evacuate` for more information.", "shortdesc": "What to do when evacuating the instance", "type": "string" } From 040ead1389163c86a2a448d19779ecc98f18d629 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 23:05:27 -0500 Subject: [PATCH 10/16] api: boot_host_shutdown_action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- doc/api-extensions.md | 5 +++++ internal/version/api.go | 1 + 2 files changed, 6 insertions(+) diff --git a/doc/api-extensions.md b/doc/api-extensions.md index 8773fc1fc20..beba9665af9 100644 --- a/doc/api-extensions.md +++ b/doc/api-extensions.md @@ -2325,3 +2325,8 @@ This introduces new options for the `cluster.evacuate` option: * `stateful-stop` has the instance store its state to disk to be resume on restore. * `force-stop` has the instance immediately stopped without waiting for it to shut down. + +## `boot_host_shutdown_action` + +This introduces a new `boot.host_shutdown_action` instance configuration key which can be used to override the default `stop` behavior on system shutdown. +It supports the values `stop`, `stateful-stop` and `force-stop`. diff --git a/internal/version/api.go b/internal/version/api.go index 5b600751e0f..48ac32c4fe4 100644 --- a/internal/version/api.go +++ b/internal/version/api.go @@ -392,6 +392,7 @@ var APIExtensions = []string{ "loki_config_instance", "instance_create_start", "clustering_evacuation_stop_options", + "boot_host_shutdown_action", } // APIExtensionsCount returns the number of available API extensions. 
From 8fe56fbb6c76d8b8efa66e73bda48c7d757a1e16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 23:12:33 -0500 Subject: [PATCH 11/16] internal/instance: Add boot.host_shutdown_action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- internal/instance/config.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/internal/instance/config.go b/internal/instance/config.go index a63ae76d81f..459d4c303e2 100644 --- a/internal/instance/config.go +++ b/internal/instance/config.go @@ -63,6 +63,15 @@ var InstanceConfigKeysAny = map[string]func(value string) error{ // shortdesc: What order to shut down the instances in "boot.stop.priority": validate.Optional(validate.IsInt64), + // gendoc:generate(entity=instance, group=boot, key=boot.host_shutdown_action) + // Action to take on host shut down + // --- + // type: string + // defaultdesc: stop + // liveupdate: yes + // shortdesc: What action to take on the instance when the host is shut down + "boot.host_shutdown_action": validate.Optional(validate.IsOneOf("stop", "force-stop", "stateful-stop")), + // gendoc:generate(entity=instance, group=boot, key=boot.host_shutdown_timeout) // Number of seconds to wait for the instance to shut down before it is force-stopped. 
// --- From b72b5830d30d236b4253e6b5833f01aadd560914 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 23:12:58 -0500 Subject: [PATCH 12/16] doc: Update configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- doc/config_options.txt | 8 ++++++++ internal/server/metadata/configuration.json | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/doc/config_options.txt b/doc/config_options.txt index b3a89ca06ce..720ddfd908a 100644 --- a/doc/config_options.txt +++ b/doc/config_options.txt @@ -40,6 +40,14 @@ The number of seconds to wait after the instance started before starting the nex The instance with the highest value is started first. ``` +```{config:option} boot.host_shutdown_action instance-boot +:defaultdesc: "stop" +:liveupdate: "yes" +:shortdesc: "What action to take on the instance when the host is shut down" +:type: "string" +Action to take on host shut down +``` + ```{config:option} boot.host_shutdown_timeout instance-boot :defaultdesc: "30" :liveupdate: "yes" diff --git a/internal/server/metadata/configuration.json b/internal/server/metadata/configuration.json index 6dbfc342638..58e52c4779f 100644 --- a/internal/server/metadata/configuration.json +++ b/internal/server/metadata/configuration.json @@ -50,6 +50,15 @@ "type": "integer" } }, + { + "boot.host_shutdown_action": { + "defaultdesc": "stop", + "liveupdate": "yes", + "longdesc": "Action to take on host shut down", + "shortdesc": "What action to take on the instance when the host is shut down", + "type": "string" + } + }, { "boot.host_shutdown_timeout": { "defaultdesc": "30", From 210ac67a0b1343f79fa0e16cdc7344bf498da72c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 23:14:28 -0500 Subject: [PATCH 13/16] scripts/bash: Add boot.host_shutdown_action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Signed-off-by: Stéphane Graber --- scripts/bash/incus | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/bash/incus b/scripts/bash/incus index 049659b88a0..0cda3e26391 100644 --- a/scripts/bash/incus +++ b/scripts/bash/incus @@ -90,6 +90,7 @@ _have incus && { container_keys="boot.autostart boot.autostart.delay \ boot.autostart.priority boot.stop.priority \ + boot.host_shutdown_action \ boot.host_shutdown_timeout environment. \ limits.cpu limits.cpu.allowance limits.cpu.priority \ limits.disk.priority limits.memory limits.memory.enforce \ From 50aad2c40e7febe5292dba78be51c73e6fc8e524 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 23:15:22 -0500 Subject: [PATCH 14/16] incusd/project: Add boot.host_shutdown_action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- internal/server/project/permissions.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/server/project/permissions.go b/internal/server/project/permissions.go index b6b49c96235..16c1be2946d 100644 --- a/internal/server/project/permissions.go +++ b/internal/server/project/permissions.go @@ -828,6 +828,7 @@ func isContainerLowLevelOptionForbidden(key string) bool { } if util.ValueInSlice(key, []string{ + "boot.host_shutdown_action", "boot.host_shutdown_timeout", "linux.kernel_modules", "raw.apparmor", @@ -847,6 +848,7 @@ func isContainerLowLevelOptionForbidden(key string) bool { // Return true if a low-level VM option is forbidden. 
func isVMLowLevelOptionForbidden(key string) bool { return util.ValueInSlice(key, []string{ + "boot.host_shutdown_action", "boot.host_shutdown_timeout", "limits.memory.hugepages", "raw.idmap", From ac2555e02fe8cfbcccdd9d5cffb3e284e877d71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 23:20:10 -0500 Subject: [PATCH 15/16] incusd/instances: Add support for boot.host_shutdown_action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- cmd/incusd/instances.go | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/cmd/incusd/instances.go b/cmd/incusd/instances.go index fe58879fb36..1a083d6f22e 100644 --- a/cmd/incusd/instances.go +++ b/cmd/incusd/instances.go @@ -213,6 +213,7 @@ func instancesStart(s *state.State, instances []instance.Instance) { // Get the instance config. config := inst.ExpandedConfig() autoStartDelay := config["boot.autostart.delay"] + shutdownAction := config["boot.host_shutdown_action"] instLogger := logger.AddContext(logger.Ctx{"project": inst.Project().Name, "instance": inst.Name()}) @@ -220,7 +221,16 @@ func instancesStart(s *state.State, instances []instance.Instance) { var attempt = 0 for { attempt++ - err := inst.Start(false) + + var err error + if shutdownAction == "stateful-stop" { + // Attempt to restore state. + err = inst.Start(true) + } else { + // Normal startup. + err = inst.Start(false) + } + if err != nil { if api.StatusErrorCheck(err, http.StatusServiceUnavailable) { break // Don't log or retry instances that are not ready to start yet. 
@@ -379,13 +389,26 @@ func instancesShutdown(s *state.State, instances []instance.Instance) { timeoutSeconds, _ = strconv.Atoi(value) } - err := inst.Shutdown(time.Second * time.Duration(timeoutSeconds)) - if err != nil { - logger.Warn("Failed shutting down instance, forcefully stopping", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err}) - err = inst.Stop(false) + action := inst.ExpandedConfig()["boot.host_shutdown_action"] + if action == "stateful-stop" { + err := inst.Stop(true) + if err != nil { + logger.Warn("Failed statefully stopping instance", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err}) + } + } else if action == "force-stop" { + err := inst.Stop(false) if err != nil { logger.Warn("Failed forcefully stopping instance", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err}) } + } else { + err := inst.Shutdown(time.Second * time.Duration(timeoutSeconds)) + if err != nil { + logger.Warn("Failed shutting down instance, forcefully stopping", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err}) + err = inst.Stop(false) + if err != nil { + logger.Warn("Failed forcefully stopping instance", logger.Ctx{"project": inst.Project().Name, "instance": inst.Name(), "err": err}) + } + } } if inst.ID() > 0 { From bebcc75634d1807bc221f02eec59495bdfdd0229 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 24 Jan 2024 23:38:55 -0500 Subject: [PATCH 16/16] incusd/instance: Fallback to stateless start when no state available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane Graber --- internal/server/instance/drivers/driver_lxc.go | 8 +------- internal/server/instance/drivers/driver_qemu.go | 11 +++++------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/internal/server/instance/drivers/driver_lxc.go b/internal/server/instance/drivers/driver_lxc.go index 
0a96c614960..8cde1d20a30 100644 --- a/internal/server/instance/drivers/driver_lxc.go +++ b/internal/server/instance/drivers/driver_lxc.go @@ -2425,13 +2425,7 @@ func (d *lxc) Start(stateful bool) error { } // If stateful, restore now. - if stateful { - if !d.stateful { - err = fmt.Errorf("Instance has no existing state to restore") - op.Done(err) - return err - } - + if stateful && d.stateful { d.logger.Info("Restoring stateful checkpoint") criuMigrationArgs := instance.CriuMigrationArgs{ diff --git a/internal/server/instance/drivers/driver_qemu.go b/internal/server/instance/drivers/driver_qemu.go index d63ef74dc83..e3c00ff12b1 100644 --- a/internal/server/instance/drivers/driver_qemu.go +++ b/internal/server/instance/drivers/driver_qemu.go @@ -1438,13 +1438,12 @@ func (d *qemu) start(stateful bool, op *operationlock.InstanceOperation) error { // If stateful, restore now. if stateful { - if !d.stateful { - err = fmt.Errorf("Instance has no existing state to restore") - op.Done(err) - return err + if d.stateful { + qemuCmd = append(qemuCmd, "-incoming", "defer") + } else { + // No state to restore, just start as normal. + stateful = false } - - qemuCmd = append(qemuCmd, "-incoming", "defer") } else if d.stateful { // Stateless start requested but state is present, delete it. err := os.Remove(d.StatePath())