Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify SVE table lookup #494

Merged
14 commits merged on Aug 15, 2024
3 changes: 2 additions & 1 deletion Source/UnitTest/test_decode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ TEST(decode, decode12x12)

status = astcenc_decompress_image(context, data, 16, &image, &swizzle, 0);
EXPECT_EQ(status, ASTCENC_SUCCESS);

#if 0
solidpixel marked this conversation as resolved.
Show resolved Hide resolved
for (int y = 0; y < 12; y++)
{
for (int x = 0; x < 12; x++)
Expand All @@ -74,6 +74,7 @@ TEST(decode, decode12x12)
printf("[%2dx%2d] = %03d, %03d, %03d, %03d\n", x, y, pixel[0], pixel[1], pixel[2], pixel[3]);
}
}
#endif
}

}
48 changes: 19 additions & 29 deletions Source/astcenc_vecmathlib_sve_8.h
Original file line number Diff line number Diff line change
Expand Up @@ -910,22 +910,22 @@ ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a)
* Table structure for a 16x 8-bit entry table.
*/
struct vtable8_16x8 {
vint8 t0;
svuint8_8_t t0;
solidpixel marked this conversation as resolved.
Show resolved Hide resolved
};

/*
* Table structure for a 32x 8-bit entry table.
*/
struct vtable8_32x8 {
vint8 t0;
svuint8_8_t t0;
};

/*
* Table structure for a 64x 8-bit entry table.
*/
struct vtable8_64x8 {
vint8 t0;
vint8 t1;
svuint8_8_t t0;
svuint8_8_t t1;
};

/**
Expand All @@ -936,7 +936,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
const uint8_t* data
) {
// Top half of register will be zeros
table.t0 = vint8(svld1_u8(svptrue_pat_b8(SV_VL16), data));
table.t0 = svld1_u8(svptrue_pat_b8(SV_VL16), data);
}

/**
Expand All @@ -946,7 +946,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
vtable8_32x8& table,
const uint8_t* data
) {
table.t0 = vint8(svld1_u8(svptrue_b8(), data));
table.t0 = svld1_u8(svptrue_b8(), data);
}

/**
Expand All @@ -956,8 +956,8 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
vtable8_64x8& table,
const uint8_t* data
) {
table.t0 = vint8(svld1_u8(svptrue_b8(), data));
table.t1 = vint8(svld1_u8(svptrue_b8(), data + 32));
table.t0 = svld1_u8(svptrue_b8(), data);
table.t1 = svld1_u8(svptrue_b8(), data + 32);
}

/**
Expand All @@ -969,11 +969,9 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
) {
// Force the three unused upper bytes of each 32-bit index out of range so their table lookups return zero
svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));

svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m);
svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes);

svuint8_8_t result = svtbl_u8(tbl.t0, idx_bytes);
return vint8(svreinterpret_s32_u8(result));
}

Expand All @@ -986,40 +984,32 @@ ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
) {
// Force the three unused upper bytes of each 32-bit index out of range so their table lookups return zero
svint32_8_t idx_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));

svuint8_8_t idx_bytes = svreinterpret_u8_s32(idx_masked);
svuint8_8_t tbl_bytes = svreinterpret_u8_s32(tbl.t0.m);
svuint8_8_t result = svtbl_u8(tbl_bytes, idx_bytes);

svuint8_8_t result = svtbl_u8(tbl.t0, idx_bytes);
return vint8(svreinterpret_s32_u8(result));
}

/**
* @brief Perform a vtable lookup in a 64x 8-bit table with 32-bit indices.
*
* Future: SVE2 can do this lookup directly with svtbl2_u8(), which takes a two-register table.
*/
ASTCENC_SIMD_INLINE vint8 vtable_lookup_32bit(
const vtable8_64x8& tbl,
vint8 idx
) {
// Force the three unused upper bytes of each 32-bit index out of range so their table lookups return zero
svint32_8_t literal32 = svdup_s32(32);
svbool_8_t idx_lo_select = svcmplt(svptrue_b32(), idx.m, literal32);
svint32_8_t idx_lo_masked = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));
svint32_8_t idx_hi_masked = svorr_s32_x(svptrue_b32(), idx.m - literal32, svdup_s32(0xFFFFFF00));
svint32_8_t idxm = svorr_s32_x(svptrue_b32(), idx.m, svdup_s32(0xFFFFFF00));

svuint8_8_t idx_lo_bytes = svreinterpret_u8_s32(idx_lo_masked);
svuint8_8_t idx_hi_bytes = svreinterpret_u8_s32(idx_hi_masked);
svuint8_8_t idxm8 = svreinterpret_u8_s32(idxm);
svuint8_8_t t0_lookup = svtbl_u8(tbl.t0, idxm8);

svuint8_8_t tbl0_bytes = svreinterpret_u8_s32(tbl.t0.m);
svuint8_8_t tbl1_bytes = svreinterpret_u8_s32(tbl.t1.m);
idxm8 = svsub_u8_x(svptrue_b8(), idxm8, svdup_u8(32));
svuint8_8_t t1_lookup = svtbl_u8(tbl.t1, idxm8);

svint32_8_t t0_lookup = svreinterpret_s32_u8(svtbl_u8(tbl0_bytes, idx_lo_bytes));
svint32_8_t t1_lookup = svreinterpret_s32_u8(svtbl_u8(tbl1_bytes, idx_hi_bytes));

svint32_8_t result = svsel_s32(idx_lo_select, t0_lookup, t1_lookup);

// Future: SVE2 can directly do svtbl2_u8() for a two register table
return vint8(result);
svuint8_8_t result = svorr_u8_x(svptrue_b32(), t0_lookup, t1_lookup);
return vint8(svreinterpret_s32_u8(result));
}

/**
Expand Down