Skip to content

Commit

Permalink
Vector library cleanup (#473)
Browse files Browse the repository at this point in the history
The astcenc vector library effectively has two different class APIs:

- A 4-wide API which is used via explicit width types (e.g. vfloat4).
- A vector length agnostic API, which is used via implicit width types 
  (e.g. vfloat) in the codec that are resolved at compile time.

For historical reasons the classes that are only used as VLA types 
(e.g. vfloat8 for AVX2) implement more API than needed because it was
inherited from the original 4-wide implementation. This makes adding new
VLA implementations (e.g. Arm SVE) more expensive than needed.

This PR doesn't add SVE support, but minimizes the VLA API as a 
precursor to doing so. The main changes are:

* Remove VLA indexable .lane<N>() reads.
* Remove VLA float lane_id() factory functions.
* Replace VLA use of .lane<0>() with dedicated scalar functions, e.g. use
  hmax_s() rather than hmax().lane<0>().
  • Loading branch information
solidpixel authored Jul 17, 2024
1 parent ffdc45e commit 69bc17b
Show file tree
Hide file tree
Showing 10 changed files with 1,123 additions and 1,029 deletions.
1,940 changes: 1,080 additions & 860 deletions Source/UnitTest/test_simd.cpp

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Source/astcenc_decompress_symbolic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ void unpack_weights(
{
vint summed_value(8);
vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
int max_weight_count = hmax_s(weight_count);

promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
Expand Down Expand Up @@ -145,7 +145,7 @@ void unpack_weights(
vint sum_plane2(8);

vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
int max_weight_count = hmax_s(weight_count);

promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
Expand Down
4 changes: 2 additions & 2 deletions Source/astcenc_ideal_endpoints_and_weights.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,7 @@ void compute_ideal_weights_for_decimation(

// Accumulate error weighting of all the texels using this weight
vint weight_texel_count(di.weight_texel_count + i);
unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
unsigned int max_texel_count = hmax_s(weight_texel_count);
promise(max_texel_count > 0);

for (unsigned int j = 0; j < max_texel_count; j++)
Expand Down Expand Up @@ -947,7 +947,7 @@ void compute_ideal_weights_for_decimation(

// Accumulate error weighting of all the texels using this weight
vint weight_texel_count(di.weight_texel_count + i);
unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
unsigned int max_texel_count = hmax_s(weight_texel_count);
promise(max_texel_count > 0);

for (unsigned int j = 0; j < max_texel_count; j++)
Expand Down
6 changes: 3 additions & 3 deletions Source/astcenc_pick_best_endpoint_format.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
Expand Down Expand Up @@ -1306,8 +1306,8 @@ unsigned int compute_ideal_endpoint_formats(
// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
vbest_error_index = hmin(vbest_error_index);
int best_error_index = vbest_error_index.lane<0>();

int best_error_index = hmin_s(vbest_error_index);

best_error_weights[i] = best_error_index;

Expand Down
103 changes: 19 additions & 84 deletions Source/astcenc_vecmathlib_avx2_8.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,18 +74,6 @@ struct vfloat8
m = _mm256_set1_ps(a);
}

/**
* @brief Construct from 8 scalar values.
*
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vfloat8(
float a, float b, float c, float d,
float e, float f, float g, float h)
{
m = _mm256_set_ps(h, g, f, e, d, c, b, a);
}

/**
* @brief Construct from an existing SIMD register.
*/
Expand All @@ -94,20 +82,6 @@ struct vfloat8
m = a;
}

/**
* @brief Get the scalar value of a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE float lane() const
{
#if !defined(__clang__) && defined(_MSC_VER)
return m.m256_f32[l];
#else
union { __m256 m; float f[8]; } cvt;
cvt.m = m;
return cvt.f[l];
#endif
}

/**
* @brief Factory that returns a vector of zeros.
*/
Expand All @@ -132,14 +106,6 @@ struct vfloat8
return vfloat8(_mm256_load_ps(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat8 lane_id()
{
return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
}

/**
* @brief The vector ...
*/
Expand Down Expand Up @@ -190,18 +156,6 @@ struct vint8
m = _mm256_set1_epi32(a);
}

/**
* @brief Construct from 8 scalar values.
*
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vint8(
int a, int b, int c, int d,
int e, int f, int g, int h)
{
m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
}

/**
* @brief Construct from an existing SIMD register.
*/
Expand All @@ -210,20 +164,6 @@ struct vint8
m = a;
}

/**
* @brief Get the scalar from a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE int lane() const
{
#if !defined(__clang__) && defined(_MSC_VER)
return m.m256i_i32[l];
#else
union { __m256i m; int f[8]; } cvt;
cvt.m = m;
return cvt.f[l];
#endif
}

/**
* @brief Factory that returns a vector of zeros.
*/
Expand Down Expand Up @@ -528,6 +468,14 @@ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
return vmin;
}

/**
 * @brief Return the horizontal minimum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
{
return _mm256_cvtsi256_si32(hmin(a).m);
}

/**
* @brief Return the horizontal maximum of a vector.
*/
Expand All @@ -543,6 +491,14 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
return vmax;
}

/**
 * @brief Return the horizontal maximum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
{
return _mm256_cvtsi256_si32(hmax(a).m);
}

/**
* @brief Store a vector to a 16B aligned memory address.
*/
Expand Down Expand Up @@ -570,14 +526,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
Expand Down Expand Up @@ -786,19 +734,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
return a;
}

/**
* @brief Return a clamped value between 0.0f and max.
*
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
* be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
return a;
}

/**
* @brief Return a clamped value between 0.0f and 1.0f.
*
Expand Down Expand Up @@ -857,7 +792,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
*/
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
return hmin(a).lane<0>();
return _mm256_cvtss_f32(hmin(a).m);
}

/**
Expand Down Expand Up @@ -887,7 +822,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
*/
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
return hmax(a).lane<0>();
return _mm256_cvtss_f32(hmax(a).m);
}

/**
Expand Down Expand Up @@ -1146,7 +1081,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
* @brief Return a vector of interleaved RGBA data.
*
* Input vectors have the value stored in the bottom 8 bits of each lane,
* with high bits set to zero.
* with high bits set to zero.
*
* Output vector stores a single RGBA texel packed in each lane.
*/
Expand Down
28 changes: 16 additions & 12 deletions Source/astcenc_vecmathlib_common_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,22 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
return a.lane<0>() + a.lane<1>() + a.lane<2>();
}

/**
 * @brief Return the horizontal minimum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
{
return hmin(a).lane<0>();
}

/**
 * @brief Return the horizontal maximum of a vector as a scalar.
 */
ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
{
return hmax(a).lane<0>();
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================
Expand Down Expand Up @@ -222,18 +238,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
return min(max(a, minv), maxv);
}

/**
* @brief Return the clamped value between 0.0f and max.
*
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
* be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
{
// Do not reorder - second operand will return if either is NaN
return min(max(a, vfloat4::zero()), maxv);
}

/**
* @brief Return the clamped value between 0.0f and 1.0f.
*
Expand Down
24 changes: 0 additions & 24 deletions Source/astcenc_vecmathlib_neon_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,6 @@ struct vfloat4
return vfloat4(vld1q_f32(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
return vfloat4(vld1q_f32(data));
}

/**
* @brief Return a swizzled float 2.
*/
Expand Down Expand Up @@ -611,21 +602,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
alignas(16) int idx[4];
storea(indices, idx);
alignas(16) int vals[4];
vals[0] = base[idx[0]];
vals[1] = base[idx[1]];
vals[2] = base[idx[2]];
vals[3] = base[idx[3]];
return vint4(vals);
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
Expand Down
19 changes: 0 additions & 19 deletions Source/astcenc_vecmathlib_none_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,14 +139,6 @@ struct vfloat4
return vfloat4(p);
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
}

/**
* @brief Return a swizzled float 2.
*/
Expand Down Expand Up @@ -684,17 +676,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
std::memcpy(p, a.m, sizeof(uint8_t) * 4);
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
return vint4(base[indices.m[0]],
base[indices.m[1]],
base[indices.m[2]],
base[indices.m[3]]);
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
Expand Down
22 changes: 0 additions & 22 deletions Source/astcenc_vecmathlib_sse_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,6 @@ struct vfloat4
return vfloat4(_mm_load_ps(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
return vfloat4(_mm_set_ps(3, 2, 1, 0));
}

/**
* @brief Return a swizzled float 2.
*/
Expand Down Expand Up @@ -663,20 +655,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
#if ASTCENC_AVX >= 2
return vint4(_mm_i32gather_epi32(base, indices.m, 4));
#else
alignas(16) int idx[4];
storea(indices, idx);
return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
#endif
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
Expand Down
2 changes: 1 addition & 1 deletion Source/astcenc_weight_align.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ static void compute_lowest_and_highest_weight(
promise(weight_count > 0);
promise(max_angular_steps > 0);

vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);

// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
Expand Down

0 comments on commit 69bc17b

Please sign in to comment.