Vector library cleanup #473

Merged 10 commits on Jul 17, 2024
1,940 changes: 1,080 additions & 860 deletions Source/UnitTest/test_simd.cpp

Large diffs are not rendered by default.
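The recurring call-site change in this cleanup replaces per-lane extraction of a horizontal reduction with scalar reduction helpers. A minimal before/after sketch, lifted from the hunks below (vint, hmax and hmax_s are the library's own names; weight_count is just the example operand):

    // Before: hmax() broadcasts the maximum into every lane, lane<0>() reads it back
    vint weight_count(di.texel_weight_count + i);
    int max_weight_count = hmax(weight_count).lane<0>();

    // After: hmax_s() performs the reduction and returns the scalar directly
    int max_weight_count = hmax_s(weight_count);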

4 changes: 2 additions & 2 deletions Source/astcenc_decompress_symbolic.cpp
@@ -110,7 +110,7 @@ void unpack_weights(
{
vint summed_value(8);
vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
int max_weight_count = hmax_s(weight_count);

promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
@@ -145,7 +145,7 @@ void unpack_weights(
vint sum_plane2(8);

vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
int max_weight_count = hmax_s(weight_count);

promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
4 changes: 2 additions & 2 deletions Source/astcenc_ideal_endpoints_and_weights.cpp
@@ -889,7 +889,7 @@ void compute_ideal_weights_for_decimation(

// Accumulate error weighting of all the texels using this weight
vint weight_texel_count(di.weight_texel_count + i);
unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
unsigned int max_texel_count = hmax_s(weight_texel_count);
promise(max_texel_count > 0);

for (unsigned int j = 0; j < max_texel_count; j++)
@@ -947,7 +947,7 @@ void compute_ideal_weights_for_decimation(

// Accumulate error weighting of all the texels using this weight
vint weight_texel_count(di.weight_texel_count + i);
unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
unsigned int max_texel_count = hmax_s(weight_texel_count);
promise(max_texel_count > 0);

for (unsigned int j = 0; j < max_texel_count; j++)
6 changes: 3 additions & 3 deletions Source/astcenc_pick_best_endpoint_format.cpp
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -1306,8 +1306,8 @@ unsigned int compute_ideal_endpoint_formats(
// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
vbest_error_index = hmin(vbest_error_index);
int best_error_index = vbest_error_index.lane<0>();

int best_error_index = hmin_s(vbest_error_index);

best_error_weights[i] = best_error_index;

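For context, a hypothetical scalar restatement of the selection above; select(a, b, mask) keeps b where the mask is set, and lane_error, min_error and lane_index are illustrative names rather than identifiers from the source:

    int best_error_index = 0x7FFFFFFF;
    for (unsigned int lane = 0; lane < ASTCENC_SIMD_WIDTH; lane++)
    {
        // Only lanes that achieved the minimum error are candidates; ties resolve
        // to the smallest stored index, so the result is invariant across SIMD widths.
        if (lane_error[lane] == min_error && lane_index[lane] < best_error_index)
        {
            best_error_index = lane_index[lane];
        }
    }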
103 changes: 19 additions & 84 deletions Source/astcenc_vecmathlib_avx2_8.h
@@ -74,18 +74,6 @@ struct vfloat8
m = _mm256_set1_ps(a);
}

/**
* @brief Construct from 8 scalar values.
*
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vfloat8(
float a, float b, float c, float d,
float e, float f, float g, float h)
{
m = _mm256_set_ps(h, g, f, e, d, c, b, a);
}

/**
* @brief Construct from an existing SIMD register.
*/
@@ -94,20 +82,6 @@ struct vfloat8
m = a;
}

/**
* @brief Get the scalar value of a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE float lane() const
{
#if !defined(__clang__) && defined(_MSC_VER)
return m.m256_f32[l];
#else
union { __m256 m; float f[8]; } cvt;
cvt.m = m;
return cvt.f[l];
#endif
}

/**
* @brief Factory that returns a vector of zeros.
*/
@@ -132,14 +106,6 @@ struct vfloat8
return vfloat8(_mm256_load_ps(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat8 lane_id()
{
return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
}

/**
* @brief The vector ...
*/
@@ -190,18 +156,6 @@ struct vint8
m = _mm256_set1_epi32(a);
}

/**
* @brief Construct from 8 scalar values.
*
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vint8(
int a, int b, int c, int d,
int e, int f, int g, int h)
{
m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
}

/**
* @brief Construct from an existing SIMD register.
*/
@@ -210,20 +164,6 @@ struct vint8
m = a;
}

/**
* @brief Get the scalar from a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE int lane() const
{
#if !defined(__clang__) && defined(_MSC_VER)
return m.m256i_i32[l];
#else
union { __m256i m; int f[8]; } cvt;
cvt.m = m;
return cvt.f[l];
#endif
}

/**
* @brief Factory that returns a vector of zeros.
*/
@@ -528,6 +468,14 @@ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
return vmin;
}

/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
{
return _mm256_cvtsi256_si32(hmin(a).m);
}

/**
* @brief Return the horizontal maximum of a vector.
*/
@@ -543,6 +491,14 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
return vmax;
}

/**
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
{
return _mm256_cvtsi256_si32(hmax(a).m);
}

/**
* @brief Store a vector to a 16B aligned memory address.
*/
@@ -570,14 +526,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
@@ -786,19 +734,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
return a;
}

/**
* @brief Return a clamped value between 0.0f and max.
*
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
* be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
return a;
}

/**
* @brief Return a clamped value between 0.0f and 1.0f.
*
@@ -857,7 +792,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
*/
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
return hmin(a).lane<0>();
return _mm256_cvtss_f32(hmin(a).m);
}

/**
@@ -887,7 +822,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
*/
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
return hmax(a).lane<0>();
return _mm256_cvtss_f32(hmax(a).m);
}

/**
@@ -1146,7 +1081,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
* @brief Return a vector of interleaved RGBA data.
*
* Input vectors have the value stored in the bottom 8 bits of each lane,
* with high bits set to zero.
* with high bits set to zero.
*
* Output vector stores a single RGBA texel packed in each lane.
*/
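A short usage sketch of the new AVX2 scalar reductions: hmin() and hmax() leave the result broadcast across all eight lanes, so _mm256_cvtsi256_si32 can read it from lane 0 without the union-based lane<0>() accessor removed above. The data values are illustrative, and vint8's pointer-load constructor is assumed:

    alignas(32) int data[8] { 3, 7, 1, 9, 4, 6, 2, 8 };
    vint8 v(data);          // load eight lanes
    int lo = hmin_s(v);     // 1
    int hi = hmax_s(v);     // 9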
28 changes: 16 additions & 12 deletions Source/astcenc_vecmathlib_common_4.h
@@ -129,6 +129,22 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
return a.lane<0>() + a.lane<1>() + a.lane<2>();
}

/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
{
return hmin(a).lane<0>();
}

/**
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
{
return hmax(a).lane<0>();
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================
@@ -222,18 +238,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
return min(max(a, minv), maxv);
}

/**
* @brief Return the clamped value between 0.0f and max.
*
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
* be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
{
// Do not reorder - second operand will return if either is NaN
return min(max(a, vfloat4::zero()), maxv);
}

/**
* @brief Return the clamped value between 0.0f and 1.0f.
*
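The 4-wide helpers added above are generic: they reuse the per-ISA hmin()/hmax() reductions together with the lane<0>() accessor, which the 4-wide types keep. A brief usage sketch, assuming vint4's four-scalar constructor:

    vint4 v(3, 7, 1, 9);
    int lo = hmin_s(v);     // 1
    int hi = hmax_s(v);     // 9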
24 changes: 0 additions & 24 deletions Source/astcenc_vecmathlib_neon_4.h
@@ -134,15 +134,6 @@ struct vfloat4
return vfloat4(vld1q_f32(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
return vfloat4(vld1q_f32(data));
}

/**
* @brief Return a swizzled float 2.
*/
@@ -611,21 +602,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
alignas(16) int idx[4];
storea(indices, idx);
alignas(16) int vals[4];
vals[0] = base[idx[0]];
vals[1] = base[idx[1]];
vals[2] = base[idx[2]];
vals[3] = base[idx[3]];
return vint4(vals);
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
19 changes: 0 additions & 19 deletions Source/astcenc_vecmathlib_none_4.h
@@ -139,14 +139,6 @@ struct vfloat4
return vfloat4(p);
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
}

/**
* @brief Return a swizzled float 2.
*/
@@ -684,17 +676,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
std::memcpy(p, a.m, sizeof(uint8_t) * 4);
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
return vint4(base[indices.m[0]],
base[indices.m[1]],
base[indices.m[2]],
base[indices.m[3]]);
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
22 changes: 0 additions & 22 deletions Source/astcenc_vecmathlib_sse_4.h
@@ -142,14 +142,6 @@ struct vfloat4
return vfloat4(_mm_load_ps(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
return vfloat4(_mm_set_ps(3, 2, 1, 0));
}

/**
* @brief Return a swizzled float 2.
*/
@@ -663,20 +655,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
#if ASTCENC_AVX >= 2
return vint4(_mm_i32gather_epi32(base, indices.m, 4));
#else
alignas(16) int idx[4];
storea(indices, idx);
return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
#endif
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
2 changes: 1 addition & 1 deletion Source/astcenc_weight_align.cpp
@@ -164,7 +164,7 @@ static void compute_lowest_and_highest_weight(
promise(weight_count > 0);
promise(max_angular_steps > 0);

vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);

// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
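With vfloat::lane_id() removed, the step sizes are now built from the integer lane IDs and converted. A sketch of the values this line produces, assuming vint::lane_id() yields { 0, 1, ..., ASTCENC_SIMD_WIDTH - 1 } as in the per-ISA headers:

    vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
    // For an 8-wide build this holds { 1.0f, 2.0f, ..., 8.0f }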