Skip to content

Commit

Permalink
zcommon: Refactor FPU state handling in fletcher4
Browse files Browse the repository at this point in the history
Currently calls to kfpu_begin() and kfpu_end() are split between
the init() and fini() functions of the particular SIMD
implementation. This was done in openzfs#14247 as an optimization measure
for the ABD adapter. Unfortunately the split complicates FPU
handling on platforms that use a local FPU state buffer, like
Windows and macOS.

To ease porting, we introduce a boolean struct member in
fletcher_4_ops_t, indicating use of the FPU, and move the FPU state
handling from the SIMD implementations to the call sites.

Reviewed-by: Tino Reichardt <milky-zfs@mcmilk.de>
Reviewed-by: Richard Yao <richard.yao@alumni.stonybrook.edu>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Attila Fülöp <attila@fueloep.org>
Closes openzfs#14600
  • Loading branch information
AttilaFueloep authored and lundman committed Mar 16, 2023
1 parent f0fbd0d commit 27b335c
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 9 deletions.
3 changes: 2 additions & 1 deletion include/zfs_fletcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,9 @@ typedef struct fletcher_4_func {
fletcher_4_fini_f fini_byteswap;
fletcher_4_compute_f compute_byteswap;
boolean_t (*valid)(void);
boolean_t uses_fpu;
const char *name;
} fletcher_4_ops_t;
} __attribute__((aligned(64))) fletcher_4_ops_t;

_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar_ops;
_ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_superscalar4_ops;
Expand Down
23 changes: 23 additions & 0 deletions module/zcommon/zfs_fletcher.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ static const fletcher_4_ops_t fletcher_4_scalar_ops = {
.fini_byteswap = fletcher_4_scalar_fini,
.compute_byteswap = fletcher_4_scalar_byteswap,
.valid = fletcher_4_scalar_valid,
.uses_fpu = B_FALSE,
.name = "scalar"
};

Expand Down Expand Up @@ -458,9 +459,15 @@ fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
fletcher_4_ctx_t ctx;
const fletcher_4_ops_t *ops = fletcher_4_impl_get();

if (ops->uses_fpu == B_TRUE) {
kfpu_begin();
}
ops->init_native(&ctx);
ops->compute_native(&ctx, buf, size);
ops->fini_native(&ctx, zcp);
if (ops->uses_fpu == B_TRUE) {
kfpu_end();
}
}

void
Expand Down Expand Up @@ -500,9 +507,15 @@ fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
fletcher_4_ctx_t ctx;
const fletcher_4_ops_t *ops = fletcher_4_impl_get();

if (ops->uses_fpu == B_TRUE) {
kfpu_begin();
}
ops->init_byteswap(&ctx);
ops->compute_byteswap(&ctx, buf, size);
ops->fini_byteswap(&ctx, zcp);
if (ops->uses_fpu == B_TRUE) {
kfpu_end();
}
}

void
Expand Down Expand Up @@ -661,6 +674,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \
fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \
fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
fletcher_4_fastest_impl.uses_fpu = src->uses_fpu; \
}

#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */
Expand Down Expand Up @@ -816,10 +830,14 @@ abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
cdp->acd_private = (void *) ops;

if (ops->uses_fpu == B_TRUE) {
kfpu_begin();
}
if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
ops->init_native(cdp->acd_ctx);
else
ops->init_byteswap(cdp->acd_ctx);

}

static void
Expand All @@ -833,8 +851,13 @@ abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
else
ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);

if (ops->uses_fpu == B_TRUE) {
kfpu_end();
}
}


static void
abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
zio_abd_checksum_data_t *cdp)
Expand Down
3 changes: 1 addition & 2 deletions module/zcommon/zfs_fletcher_aarch64_neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
{
kfpu_begin();
memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
}

Expand All @@ -70,7 +69,6 @@ fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
ctx->aarch64_neon[1].v[1];
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
}

#define NEON_INIT_LOOP() \
Expand Down Expand Up @@ -205,6 +203,7 @@ const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
.compute_byteswap = fletcher_4_aarch64_neon_byteswap,
.fini_byteswap = fletcher_4_aarch64_neon_fini,
.valid = fletcher_4_aarch64_neon_valid,
.uses_fpu = B_TRUE,
.name = "aarch64_neon"
};

Expand Down
4 changes: 2 additions & 2 deletions module/zcommon/zfs_fletcher_avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
{
kfpu_begin();
memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t));
}

Expand Down Expand Up @@ -73,7 +72,6 @@ fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
}

ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
}

#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \
Expand Down Expand Up @@ -166,6 +164,7 @@ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
.fini_byteswap = fletcher_4_avx512f_fini,
.compute_byteswap = fletcher_4_avx512f_byteswap,
.valid = fletcher_4_avx512f_valid,
.uses_fpu = B_TRUE,
.name = "avx512f"
};

Expand Down Expand Up @@ -216,6 +215,7 @@ const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
.fini_byteswap = fletcher_4_avx512f_fini,
.compute_byteswap = fletcher_4_avx512bw_byteswap,
.valid = fletcher_4_avx512bw_valid,
.uses_fpu = B_TRUE,
.name = "avx512bw"
};
#endif
Expand Down
3 changes: 1 addition & 2 deletions module/zcommon/zfs_fletcher_intel.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
{
kfpu_begin();
memset(ctx->avx, 0, 4 * sizeof (zfs_fletcher_avx_t));
}

Expand Down Expand Up @@ -82,7 +81,6 @@ fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
64 * ctx->avx[3].v[3];

ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
}

#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \
Expand Down Expand Up @@ -163,6 +161,7 @@ const fletcher_4_ops_t fletcher_4_avx2_ops = {
.fini_byteswap = fletcher_4_avx2_fini,
.compute_byteswap = fletcher_4_avx2_byteswap,
.valid = fletcher_4_avx2_valid,
.uses_fpu = B_TRUE,
.name = "avx2"
};

Expand Down
4 changes: 2 additions & 2 deletions module/zcommon/zfs_fletcher_sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
{
kfpu_begin();
memset(ctx->sse, 0, 4 * sizeof (zfs_fletcher_sse_t));
}

Expand Down Expand Up @@ -81,7 +80,6 @@ fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];

ZIO_SET_CHECKSUM(zcp, A, B, C, D);
kfpu_end();
}

#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \
Expand Down Expand Up @@ -164,6 +162,7 @@ const fletcher_4_ops_t fletcher_4_sse2_ops = {
.fini_byteswap = fletcher_4_sse2_fini,
.compute_byteswap = fletcher_4_sse2_byteswap,
.valid = fletcher_4_sse2_valid,
.uses_fpu = B_TRUE,
.name = "sse2"
};

Expand Down Expand Up @@ -218,6 +217,7 @@ const fletcher_4_ops_t fletcher_4_ssse3_ops = {
.fini_byteswap = fletcher_4_sse2_fini,
.compute_byteswap = fletcher_4_ssse3_byteswap,
.valid = fletcher_4_ssse3_valid,
.uses_fpu = B_TRUE,
.name = "ssse3"
};

Expand Down
1 change: 1 addition & 0 deletions module/zcommon/zfs_fletcher_superscalar.c
Original file line number Diff line number Diff line change
Expand Up @@ -163,5 +163,6 @@ const fletcher_4_ops_t fletcher_4_superscalar_ops = {
.compute_byteswap = fletcher_4_superscalar_byteswap,
.fini_byteswap = fletcher_4_superscalar_fini,
.valid = fletcher_4_superscalar_valid,
.uses_fpu = B_FALSE,
.name = "superscalar"
};
1 change: 1 addition & 0 deletions module/zcommon/zfs_fletcher_superscalar4.c
Original file line number Diff line number Diff line change
Expand Up @@ -229,5 +229,6 @@ const fletcher_4_ops_t fletcher_4_superscalar4_ops = {
.compute_byteswap = fletcher_4_superscalar4_byteswap,
.fini_byteswap = fletcher_4_superscalar4_fini,
.valid = fletcher_4_superscalar4_valid,
.uses_fpu = B_FALSE,
.name = "superscalar4"
};

0 comments on commit 27b335c

Please sign in to comment.