Commit 7ac5811

more fixes, including from latest round of testing

1 parent a788de4

4 files changed: +108 −42 lines changed

common/src/zpool_name.rs

Lines changed: 8 additions & 0 deletions

@@ -120,7 +120,15 @@ impl ZpoolName {
 
     /// Returns a path to a dataset's mountpoint within the zpool.
     ///
+    /// `dataset` is relative to the root of the pool. For example, if your
+    /// pool is `oxp_123`, then your full dataset name would be something like
+    /// `oxp_123/foo/bar`. You'd pass `foo/bar` to this function and it would
+    /// return something like `/pool/ext/123/foo/bar`.
+    ///
     /// For example: oxp_(UUID) -> /pool/ext/(UUID)/(dataset)
+    ///
+    /// In this example, the value of UUID here is implied by `self`. `dataset`
+    /// is the argument to this function.
     pub fn dataset_mountpoint(
         &self,
         root: &Utf8Path,
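
For orientation, a minimal usage sketch of the documented mapping. This is not code from the commit: the pool UUID, the "/" mount root, and the assumption that `ZpoolName` parses from its `oxp_<uuid>` string form are all illustrative.

    use camino::Utf8Path;

    // Hypothetical pool; any valid oxp_<uuid> name would do.
    let pool: ZpoolName =
        "oxp_123e4567-e89b-12d3-a456-426614174000".parse().unwrap();

    // "foo/bar" is relative to the pool root, per the new doc comment.
    let mountpoint = pool.dataset_mountpoint(Utf8Path::new("/"), "foo/bar");

    // Expected: /pool/ext/123e4567-e89b-12d3-a456-426614174000/foo/bar,
    // following the oxp_(UUID) -> /pool/ext/(UUID)/(dataset) rule above.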

illumos-utils/src/zfs.rs

Lines changed: 1 addition & 1 deletion

@@ -1287,7 +1287,7 @@ impl Zfs {
     /// Calls "zfs get" to acquire multiple values
     ///
     /// - `names`: The properties being acquired
-    /// - `source`: The optioanl property source (origin of the property)
+    /// - `source`: The optional property source (origin of the property)
     ///   Defaults to "all sources" when unspecified.
     pub async fn get_values<const N: usize>(
         filesystem_name: &str,
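
A rough usage sketch of `Zfs::get_values`, for context only: the hunk shows just `filesystem_name` and the const-generic `N`, so the `names` array, the `None` source argument, and the destructured return value below are assumptions rather than the confirmed signature.

    // Hedged sketch: fetch two properties in one "zfs get" call, letting the
    // source default to "all sources" as the doc comment describes.
    let [atime, mounted] = Zfs::get_values(
        "oxp_123e4567-e89b-12d3-a456-426614174000/crypt/debug", // illustrative
        &["atime", "mounted"],
        None,
    )
    .await?;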

sled-agent/config-reconciler/src/dump_setup.rs

Lines changed: 16 additions & 7 deletions

@@ -658,20 +658,29 @@ impl DumpSetupWorker {
                 zone_name,
                 completion_tx,
             })) => {
-                if let Err(error) = self
+                match self
                     .do_archive_former_zone_root(
                         &zone_root,
                         &zone_name,
                         completion_tx,
                     )
                     .await
                 {
-                    error!(
-                        self.log,
-                        "Failed to archive former zone root";
-                        "zone_root" => %zone_root,
-                        InlineErrorChain::new(&error),
-                    );
+                    Ok(()) => {
+                        info!(
+                            self.log,
+                            "Archived logs from former zone root";
+                            "zone_root" => %zone_root
+                        );
+                    }
+                    Err(error) => {
+                        error!(
+                            self.log,
+                            "Failed to archive former zone root";
+                            "zone_root" => %zone_root,
+                            InlineErrorChain::new(&error),
+                        );
+                    }
                 }
             }
             Ok(None) => {

sled-agent/config-reconciler/src/reconciler_task/external_disks.rs

Lines changed: 83 additions & 34 deletions

@@ -466,11 +466,14 @@ impl ExternalDisks {
 
         let mut newly_adopted = Vec::new();
         for disk_state in disk_states {
-            let newly_adopted_disk = match self.disks.entry(disk_state.id()) {
+            let disk_id = disk_state.id();
+            let newly_adopted_disk = match self.disks.entry(disk_id) {
                 Entry::Vacant(vacant) => {
                     let new_state = vacant.insert(disk_state);
                     match &new_state.state {
-                        DiskState::Managed(disk) => Some(*disk.zpool_name()),
+                        DiskState::Managed(disk) => {
+                            Some((disk_id, *disk.zpool_name()))
+                        }
                         DiskState::FailedToManage(..) => None,
                     }
                 }
@@ -483,34 +486,70 @@ impl ExternalDisks {
                         (
                             DiskState::FailedToManage(_),
                             DiskState::Managed(disk),
-                        ) => Some(*disk.zpool_name()),
+                        ) => Some((disk_id, *disk.zpool_name())),
                     }
                 }
             };
 
-            if let Some(disk_ref) = newly_adopted_disk {
-                newly_adopted.push(disk_ref);
+            if let Some(info) = newly_adopted_disk {
+                newly_adopted.push(info);
             }
         }
 
+        // Update the output channels now. This is important to do before
+        // cleaning up former zone root datasets because that step will require
+        // that the archival task (DumpSetup) has seen the new disks and added
+        // any debug datasets found on them.
         self.update_output_watch_channels();
 
-        // For any newly-adopted disks, attempt to archive and destroy any zone
-        // root datasets that we find on them.
-        for zpool_name in newly_adopted {
+        // For any newly-adopted disks, clean up any former zone root datasets
+        // that we find on them.
+        let mut failed = Vec::new();
+        for (disk_id, zpool_name) in newly_adopted {
             if let Err(error) = self
                 .archive_and_destroy_former_zone_roots(&zpool_name, log)
                 .await
             {
-                // XXX-dap what if anything can we do about this?
+                // This situation is really unfortunate. We adopted the disk,
+                // but couldn't clean up its zone root. We now want to go back
+                // and un-adopt it.
+                //
+                // You might ask: why didn't we do this step during adoption so
+                // that we could have failed at that point? We can't: this
+                // process (archival and cleanup) depends on having already
+                // adopted some disks in order to use their debug datasets.
+                //
+                // The right long-term answer is to destroy these zone roots not
+                // here, during adoption, but when starting zones. See
+                // oxidecomputer/omicron#8316. This too is complicated.
+                //
+                // Fortunately, this case should be nearly impossible in
+                // practice. But if we get here, mark the disk accordingly.
+                let error = InlineErrorChain::new(&*error);
                 error!(
                     log,
                     "failed to destroy former zone roots on pool";
                     "pool" => %zpool_name,
-                    InlineErrorChain::new(&*error),
+                    &error,
                 );
+                failed.push((disk_id, error.to_string()));
             }
         }
+
+        // Attempt to un-adopt any disks that we failed to clean up.
+        if !failed.is_empty() {
+            for (disk_id, message) in failed {
+                // unwrap(): We got these diskids from entries in the map
+                // above.
+                let mut disk = self.disks.get_mut(&disk_id).unwrap();
+                *disk = ExternalDiskState::failed(
+                    disk.config.clone(),
+                    DiskManagementError::Other(message),
+                );
+            }
+
+            self.update_output_watch_channels();
+        }
     }
 
     async fn try_ensure_disk_managed<T: DiskAdopter>(
@@ -686,12 +725,11 @@ impl ExternalDisks {
                 "Automatically archiving/wipe of dataset: {}",
                 zone_dataset_name
            );
-            archive_and_destroy_child_datasets(
+            cleanup_former_zone_roots(
                 log,
                 mount_config,
                 &self.archiver,
                 &zpool_name,
-                &zone_dataset_name,
             )
            .await?;
            Zfs::set_oxide_value(
@@ -794,14 +832,18 @@ impl DiskAdopter for RealDiskAdopter<'_> {
     }
 }
 
-async fn archive_and_destroy_child_datasets(
+/// Given a pool name, find any zone root filesystems, attempt to archive their
+/// log files, and destroy them.
+async fn cleanup_former_zone_roots(
     log: &Logger,
     mount_config: &MountConfig,
     archiver: &FormerZoneRootArchiver,
     zpool_name: &ZpoolName,
-    zone_dataset_name: &str,
 ) -> Result<(), DiskManagementError> {
-    let child_datasets = Zfs::list_datasets(zone_dataset_name)
+    // Within each pool, ZONE_DATASET is the name of the dataset that's the
+    // parent of all the zone root filesystems' datasets.
+    let parent_dataset_name = format!("{}/{}", zpool_name, ZONE_DATASET);
+    let child_datasets = Zfs::list_datasets(&parent_dataset_name)
         .await
         .context("listing datasets")
         .map_err(|error| {
@@ -810,31 +852,38 @@ async fn archive_and_destroy_child_datasets(
             )
         })?;
 
-    for child_dataset in child_datasets {
-        let child_dataset = format!("{}/{}", zone_dataset_name, child_dataset);
-
-        // This is just a sanity check. There's no way this should ever be
-        // wrong, but the risk of destroying the wrong dataset is just so high
-        // that we want to be really sure.
-        if !child_dataset.contains(ZONE_DATASET) {
-            error!(
-                log,
-                "too afraid to delete dataset (this should be impossible)";
-                "dataset_name" => child_dataset,
-            );
-            continue;
-        }
+    for child_name in child_datasets {
+        // Determine the mountpoint of the child dataset.
+        // `dataset_mountpoint()` expects a path relative to the root of the
+        // pool. We could chop off the zpool_name from `parent_dataset_name`,
+        // or (what we do here) construct the name we need directly.
+        //
+        // This works only because ZONE_DATASET itself is relative to the root
+        // of the pool.
+        let child_dataset_relative_to_pool =
+            format!("{}/{}", ZONE_DATASET, child_name);
+        let mountpoint = zpool_name.dataset_mountpoint(
+            &mount_config.root,
+            &child_dataset_relative_to_pool,
+        );
 
-        // Attempt to archive it. This is best-effort.
-        let mountpoint =
-            zpool_name.dataset_mountpoint(&mount_config.root, &child_dataset);
+        // Attempt to archive this zone as though it's a zone name.
+        // This is best-effort.
         info!(
             log,
-            "attempting to archive logs from path";
+            "archiving logs from former zone root";
             "path" => %mountpoint
         );
         archiver.archive_former_zone_root(mountpoint).await;
-        Zfs::destroy_dataset(&child_dataset).await.map_err(|error| {
+
+        let child_dataset_name =
+            format!("{}/{}", parent_dataset_name, child_name);
+        info!(
+            log,
+            "destroying former zone root";
+            "dataset_name" => &child_dataset_name,
+        );
+        Zfs::destroy_dataset(&child_dataset_name).await.map_err(|error| {
            DiskManagementError::Other(
                InlineErrorChain::new(&error).to_string(),
            )
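
To make the naming in `cleanup_former_zone_roots()` concrete, here is a hedged walkthrough with made-up values; the ZONE_DATASET value "crypt/zone", the pool UUID, and the zone name are illustrative assumptions, not taken from this commit.

    // Illustrative values only.
    let zpool_name = "oxp_123e4567-e89b-12d3-a456-426614174000";
    let zone_dataset = "crypt/zone"; // assumed value of ZONE_DATASET
    let child_name = "oxz_ntp_example"; // as returned by Zfs::list_datasets()

    // Parent of all zone-root datasets on this pool (what gets listed):
    let parent_dataset_name = format!("{zpool_name}/{zone_dataset}");
    // => oxp_123e4567-e89b-12d3-a456-426614174000/crypt/zone

    // Pool-relative name handed to dataset_mountpoint():
    let relative = format!("{zone_dataset}/{child_name}");
    // => crypt/zone/oxz_ntp_example, which mounts at
    //    /pool/ext/123e4567-e89b-12d3-a456-426614174000/crypt/zone/oxz_ntp_example

    // Full dataset name handed to Zfs::destroy_dataset():
    let child_dataset_name = format!("{parent_dataset_name}/{child_name}");
    // => oxp_123e4567-e89b-12d3-a456-426614174000/crypt/zone/oxz_ntp_example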
