Skip to content

Commit 35a85cc

Browse files
committed
Implement new label format for large disks
This patch contains the logic for a new larger label format. This format is intended to support disks with large sector sizes. By using a larger label we can store more uberblocks and other critical pool metadata. We can also use the extra space to enable new features in ZFS going forwards. This initial commit does not add new capabilities, but provides the framework for them going forwards. Signed-off-by: Paul Dagnelie <paul.dagnelie@klarasystems.com> Sponsored-by: Wasabi, Inc. Sponsored-by: Klara, Inc.
1 parent 5a8ba45 commit 35a85cc

37 files changed

+1514
-210
lines changed

cmd/zdb/zdb.c

Lines changed: 240 additions & 48 deletions
Large diffs are not rendered by default.

cmd/zhack.c

Lines changed: 276 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -500,11 +500,11 @@ zhack_do_feature(int argc, char **argv)
500500
return (0);
501501
}
502502

503-
#define ASHIFT_UBERBLOCK_SHIFT(ashift) \
503+
#define ASHIFT_UBERBLOCK_SHIFT(ashift, new) \
504504
MIN(MAX(ashift, UBERBLOCK_SHIFT), \
505-
MAX_UBERBLOCK_SHIFT)
506-
#define ASHIFT_UBERBLOCK_SIZE(ashift) \
507-
(1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift))
505+
MAX_UBERBLOCK_SHIFT(new))
506+
#define ASHIFT_UBERBLOCK_SIZE(ashift, new) \
507+
(1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift, new))
508508

509509
#define REPAIR_LABEL_STATUS_CKSUM (1 << 0)
510510
#define REPAIR_LABEL_STATUS_UB (1 << 1)
@@ -529,6 +529,26 @@ zhack_repair_read_label(const int fd, vdev_label_t *vl,
529529
return (0);
530530
}
531531

532+
/*
 * Read exactly `buflen` bytes from `fd` at byte offset `offset` into `buf`.
 *
 * `l` is the label index and is only used in diagnostics.
 *
 * Returns 0 when the whole buffer was filled. Returns -1 (after printing a
 * message to stderr) when pread64() fails or when fewer than `buflen` bytes
 * were available; a short read is never reported as success, including the
 * 0-byte read that end-of-file produces.
 */
static int
zhack_repair_read(const int fd, uint8_t *buf, size_t buflen,
    const uint64_t offset, const int l)
{
	/* pread64() returns ssize_t; keep the full width of the result. */
	const ssize_t nread = pread64(fd, buf, buflen, offset);

	if (nread == -1) {
		(void) fprintf(stderr,
		    "error: cannot read buffer at %" PRIu64
		    " for label %d: %s\n",
		    offset, l, strerror(errno));
		return (-1);
	}

	/* nread is non-negative here, so the cast cannot change its value. */
	if ((size_t)nread != buflen) {
		(void) fprintf(stderr,
		    "error: bad read size at %" PRIu64 " for label %d \n",
		    offset, l);
		return (-1);
	}

	return (0);
}
551+
532552
static void
533553
zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset,
534554
const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum)
@@ -691,7 +711,7 @@ zhack_repair_write_uberblock(vdev_label_t *vl, const int l,
691711
(char *)vl + offsetof(vdev_label_t, vl_uberblock);
692712
zio_eck_t *ub_eck =
693713
(zio_eck_t *)
694-
((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift))) - 1;
714+
((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE))) - 1;
695715

696716
if (ub_eck->zec_magic != 0) {
697717
(void) fprintf(stderr,
@@ -710,10 +730,39 @@ zhack_repair_write_uberblock(vdev_label_t *vl, const int l,
710730
if (zhack_repair_write_label(l, fd, byteswap,
711731
ub_data, ub_eck,
712732
label_offset + offsetof(vdev_label_t, vl_uberblock),
713-
ASHIFT_UBERBLOCK_SIZE(ashift)))
733+
ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE)))
714734
labels_repaired[l] |= REPAIR_LABEL_STATUS_UB;
715735
}
716736

737+
static void
738+
zhack_repair_write_uberblock_new(void *ub_data, const int l,
739+
const uint64_t ashift, const int fd, const int byteswap,
740+
const uint64_t label_offset, uint32_t *labels_repaired)
741+
{
742+
zio_eck_t *ub_eck =
743+
(zio_eck_t *)
744+
((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift, B_FALSE))) - 1;
745+
746+
if (ub_eck->zec_magic != 0) {
747+
(void) fprintf(stderr,
748+
"error: label %d: "
749+
"Expected Uberblock checksum magic number to "
750+
"be 0, but got %" PRIu64 "\n",
751+
l, ub_eck->zec_magic);
752+
(void) fprintf(stderr, "It would appear there's already "
753+
"a checksum for the uberblock.\n");
754+
return;
755+
}
756+
757+
758+
ub_eck->zec_magic = byteswap ? BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;
759+
760+
if (zhack_repair_write_label(l, fd, byteswap,
761+
ub_data, ub_eck, label_offset + VDEV_LARGE_UBERBLOCK_RING,
762+
ASHIFT_UBERBLOCK_SIZE(ashift, B_TRUE)))
763+
labels_repaired[l] |= REPAIR_LABEL_STATUS_UB;
764+
}
765+
717766
static void
718767
zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum)
719768
{
@@ -727,12 +776,13 @@ zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum)
727776

728777
static int
729778
zhack_repair_test_cksum(const int byteswap, void *vdev_data,
730-
zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset, const int l)
779+
const uint64_t size, zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset,
780+
const int l)
731781
{
732782
const zio_cksum_t expected_cksum = vdev_eck->zec_cksum;
733783
zio_cksum_t actual_cksum;
734784
zhack_repair_calc_cksum(byteswap, vdev_data, vdev_phys_offset,
735-
VDEV_PHYS_SIZE, vdev_eck, &actual_cksum);
785+
size, vdev_eck, &actual_cksum);
736786
const uint64_t expected_magic = byteswap ?
737787
BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC;
738788
const uint64_t actual_magic = vdev_eck->zec_magic;
@@ -760,15 +810,17 @@ zhack_repair_test_cksum(const int byteswap, void *vdev_data,
760810

761811
static void
762812
zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
763-
vdev_label_t *vl, const uint64_t label_offset, const int l,
764-
uint32_t *labels_repaired)
813+
vdev_label_t *vl, const uint64_t filesize, const int l,
814+
uint32_t *labels_repaired, boolean_t *large_label)
765815
{
766816
ssize_t err;
767817
uberblock_t *ub = (uberblock_t *)vl->vl_uberblock;
768818
void *vdev_data =
769819
(char *)vl + offsetof(vdev_label_t, vl_vdev_phys);
770820
zio_eck_t *vdev_eck =
771821
(zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1;
822+
const uint64_t label_offset = vdev_label_offset(filesize, l, 0,
823+
B_FALSE);
772824
const uint64_t vdev_phys_offset =
773825
label_offset + offsetof(vdev_label_t, vl_vdev_phys);
774826
const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
@@ -802,8 +854,8 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
802854
}
803855

804856
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
805-
zhack_repair_test_cksum(byteswap, vdev_data, vdev_eck,
806-
vdev_phys_offset, l) != 0) {
857+
zhack_repair_test_cksum(byteswap, vdev_data, VDEV_PHYS_SIZE,
858+
vdev_eck, vdev_phys_offset, l) != 0) {
807859
(void) fprintf(stderr, "It would appear checksums are "
808860
"corrupted. Try zhack repair label -c <device>\n");
809861
return;
@@ -816,6 +868,8 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
816868
"error: cannot unpack nvlist label %d\n", l);
817869
return;
818870
}
871+
(void) nvlist_lookup_boolean_value(cfg, ZPOOL_CONFIG_LARGE_LABEL,
872+
large_label);
819873

820874
err = zhack_repair_check_label(ub,
821875
l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift);
@@ -840,13 +894,212 @@ zhack_repair_one_label(const zhack_repair_op_t op, const int fd,
840894

841895
zhack_repair_write_uberblock(vl,
842896
l, ashift, fd, byteswap, label_offset, labels_repaired);
897+
if (large_label) {
898+
zhack_repair_write_uberblock_new(ub, l, ashift,
899+
fd, byteswap, vdev_label_offset(filesize, l, 0,
900+
B_TRUE), labels_repaired);
901+
}
843902
}
844903

845904
if (zhack_repair_write_label(l, fd, byteswap, vdev_data, vdev_eck,
846905
vdev_phys_offset, VDEV_PHYS_SIZE))
847-
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
906+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
907+
908+
fsync(fd);
909+
}
910+
911+
static void
912+
zhack_repair_one_label_large(const zhack_repair_op_t op, const int fd,
913+
const uint64_t label_offset, const int l, uint32_t *labels_repaired)
914+
{
915+
ssize_t err;
916+
void *toc_data = NULL, *bootenv = NULL, *vdev_config = NULL;
917+
void *spa_config = NULL, *ub = NULL;
918+
/*
919+
* Note that currently, this can't handle disks with larger than 8k
920+
* sector sizes. That needs to be fixed eventually.
921+
*/
922+
toc_data = malloc(VDEV_TOC_SIZE);
923+
err = zhack_repair_read(fd, toc_data, VDEV_TOC_SIZE, label_offset, l);
924+
if (err)
925+
goto out;
926+
927+
zio_eck_t *toc_eck = (zio_eck_t *)(toc_data + VDEV_TOC_SIZE) - 1;
928+
if (toc_eck->zec_magic == 0) {
929+
(void) fprintf(stderr, "error: label %d: "
930+
"Expected the nvlist checksum magic number to not be zero"
931+
"\n",
932+
l);
933+
(void) fprintf(stderr, "There should already be a checksum "
934+
"for the label.\n");
935+
goto out;
936+
}
937+
938+
int byteswap =
939+
(toc_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC));
940+
941+
if (byteswap) {
942+
byteswap_uint64_array(&toc_eck->zec_cksum,
943+
sizeof (zio_cksum_t));
944+
toc_eck->zec_magic = BSWAP_64(toc_eck->zec_magic);
945+
}
946+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
947+
zhack_repair_test_cksum(byteswap, toc_data, VDEV_TOC_SIZE,
948+
toc_eck, label_offset, l) != 0) {
949+
(void) fprintf(stderr, "It would appear checksums are "
950+
"corrupted. Try zhack repair label -c <device>\n");
951+
goto out;
952+
}
953+
954+
nvlist_t *toc;
955+
err = nvlist_unpack(toc_data, VDEV_TOC_SIZE, &toc, 0);
956+
if (err) {
957+
(void) fprintf(stderr,
958+
"error: cannot unpack nvlist TOC %d\n", l);
959+
goto out;
960+
}
961+
962+
uint32_t bootenv_size, vc_size, sc_size;
963+
if ((err = nvlist_lookup_uint32(toc, VDEV_TOC_BOOT_REGION,
964+
&bootenv_size)) || (err = nvlist_lookup_uint32(toc,
965+
VDEV_TOC_VDEV_CONFIG, &vc_size)) || (err = nvlist_lookup_uint32(toc,
966+
VDEV_TOC_POOL_CONFIG, &sc_size))) {
967+
(void) fprintf(stderr,
968+
"error: TOC missing core fields %d\n", l);
969+
goto out;
970+
}
971+
bootenv = malloc(bootenv_size);
972+
zio_eck_t *bootenv_eck = (zio_eck_t *)(bootenv + bootenv_size) - 1;
973+
vdev_config = malloc(vc_size);
974+
zio_eck_t *vc_eck = (zio_eck_t *)(vdev_config + vc_size) - 1;
975+
spa_config = malloc(sc_size);
976+
zio_eck_t *sc_eck = (zio_eck_t *)(spa_config + sc_size) - 1;
977+
978+
uint64_t offset = label_offset + VDEV_TOC_SIZE;
979+
if (bootenv_size != 0) {
980+
if ((err = zhack_repair_read(fd, bootenv,
981+
bootenv_size, offset, l)))
982+
goto out;
983+
if (byteswap) {
984+
byteswap_uint64_array(&bootenv_eck->zec_cksum,
985+
sizeof (zio_cksum_t));
986+
bootenv_eck->zec_magic =
987+
BSWAP_64(bootenv_eck->zec_magic);
988+
}
989+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
990+
zhack_repair_test_cksum(byteswap, bootenv, bootenv_size,
991+
bootenv_eck, offset, l) != 0) {
992+
(void) fprintf(stderr, "It would appear checksums are "
993+
"corrupted. Try zhack repair label -c <device>\n");
994+
goto out;
995+
}
996+
}
997+
998+
offset += bootenv_size;
999+
if ((err = zhack_repair_read(fd, vdev_config, vc_size, offset, l)))
1000+
goto out;
1001+
1002+
if (byteswap) {
1003+
byteswap_uint64_array(&sc_eck->zec_cksum,
1004+
sizeof (zio_cksum_t));
1005+
vc_eck->zec_magic = BSWAP_64(vc_eck->zec_magic);
1006+
}
1007+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
1008+
zhack_repair_test_cksum(byteswap, vdev_config, vc_size,
1009+
vc_eck, offset, l) != 0) {
1010+
(void) fprintf(stderr, "It would appear checksums are "
1011+
"corrupted. Try zhack repair label -c <device>\n");
1012+
goto out;
1013+
}
1014+
offset += vc_size;
1015+
if ((err = zhack_repair_read(fd, spa_config, sc_size, offset, l)))
1016+
goto out;
1017+
1018+
if (byteswap) {
1019+
byteswap_uint64_array(&sc_eck->zec_cksum,
1020+
sizeof (zio_cksum_t));
1021+
vc_eck->zec_magic = BSWAP_64(sc_eck->zec_magic);
1022+
}
1023+
if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 &&
1024+
zhack_repair_test_cksum(byteswap, spa_config, sc_size,
1025+
sc_eck, offset, l) != 0) {
1026+
(void) fprintf(stderr, "It would appear checksums are "
1027+
"corrupted. Try zhack repair label -c <device>\n");
1028+
goto out;
1029+
}
1030+
1031+
nvlist_t *cfg;
1032+
err = nvlist_unpack(vdev_config, vc_size - sizeof (zio_eck_t), &cfg, 0);
1033+
if (err) {
1034+
(void) fprintf(stderr,
1035+
"error: cannot unpack nvlist label %d\n", l);
1036+
return;
1037+
}
1038+
1039+
ub = malloc(UBERBLOCK_SHIFT);
1040+
err = zhack_repair_read(fd, ub, UBERBLOCK_SHIFT,
1041+
label_offset + VDEV_LARGE_UBERBLOCK_RING, l);
1042+
if (err)
1043+
goto out;
1044+
1045+
const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION,
1046+
ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID };
1047+
nvlist_t *vdev_tree_cfg = NULL;
1048+
uint64_t ashift;
1049+
err = zhack_repair_check_label(ub, l, cfg_keys, ARRAY_SIZE(cfg_keys),
1050+
cfg, vdev_tree_cfg, &ashift);
1051+
if (err)
1052+
return;
1053+
1054+
if ((op & ZHACK_REPAIR_OP_UNDETACH) != 0) {
1055+
char *buf;
1056+
size_t buflen;
1057+
1058+
err = zhack_repair_undetach(ub, cfg, l);
1059+
if (err)
1060+
return;
1061+
1062+
buf = vdev_config;
1063+
buflen = vc_size - sizeof (zio_eck_t);
1064+
if (nvlist_pack(cfg, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) {
1065+
(void) fprintf(stderr,
1066+
"error: label %d: Failed to pack nvlist\n", l);
1067+
return;
1068+
}
1069+
1070+
zhack_repair_write_uberblock_new(ub, l, ashift, fd, byteswap,
1071+
label_offset, labels_repaired);
1072+
}
1073+
1074+
offset = label_offset;
1075+
if (zhack_repair_write_label(l, fd, byteswap, toc_data, toc_eck,
1076+
offset, VDEV_TOC_SIZE))
1077+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1078+
offset += VDEV_TOC_SIZE;
1079+
if (zhack_repair_write_label(l, fd, byteswap, bootenv, bootenv_eck,
1080+
offset, bootenv_size))
1081+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1082+
offset += bootenv_size;
1083+
if (zhack_repair_write_label(l, fd, byteswap, vdev_config, vc_eck,
1084+
offset, vc_size))
1085+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
1086+
offset += vc_size;
1087+
if (zhack_repair_write_label(l, fd, byteswap, spa_config, sc_eck,
1088+
offset, sc_size))
1089+
labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM;
8481090

8491091
fsync(fd);
1092+
out:
1093+
if (toc_data)
1094+
free(toc_data);
1095+
if (bootenv)
1096+
free(bootenv);
1097+
if (vdev_config)
1098+
free(vdev_config);
1099+
if (spa_config)
1100+
free(spa_config);
1101+
if (ub)
1102+
free(ub);
8501103
}
8511104

8521105
static const char *
@@ -889,9 +1142,18 @@ zhack_label_repair(const zhack_repair_op_t op, const int argc, char **argv)
8891142
filesize =
8901143
(filesize / sizeof (vdev_label_t)) * sizeof (vdev_label_t);
8911144

1145+
boolean_t large_label = B_FALSE;
8921146
for (int l = 0; l < VDEV_LABELS; l++) {
8931147
zhack_repair_one_label(op, fd, &labels[l],
894-
vdev_label_offset(filesize, l, 0), l, labels_repaired);
1148+
filesize, l, labels_repaired, &large_label);
1149+
if (large_label)
1150+
break;
1151+
}
1152+
if (large_label) {
1153+
for (int l = 0; l < VDEV_LABELS; l++) {
1154+
zhack_repair_one_label_large(op, fd,
1155+
filesize, l, labels_repaired);
1156+
}
8951157
}
8961158

8971159
close(fd);

0 commit comments

Comments
 (0)