Vector library cleanup #473

Merged 10 commits on Jul 17, 2024
1,940 changes: 1,080 additions & 860 deletions Source/UnitTest/test_simd.cpp

Large diffs are not rendered by default.
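The recurring call-site change in this cleanup replaces per-lane extraction of a horizontal reduction with scalar reduction helpers. A minimal before/after sketch, lifted from the hunks below (vint, hmax and hmax_s are the library's own names; weight_count is just the example operand):

    // Before: hmax() broadcasts the maximum into every lane, lane<0>() reads it back
    vint weight_count(di.texel_weight_count + i);
    int max_weight_count = hmax(weight_count).lane<0>();

    // After: hmax_s() performs the reduction and returns the scalar directly
    int max_weight_count = hmax_s(weight_count);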

4 changes: 2 additions & 2 deletions Source/astcenc_decompress_symbolic.cpp
@@ -110,7 +110,7 @@ void unpack_weights(
{
vint summed_value(8);
vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
int max_weight_count = hmax_s(weight_count);

promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
@@ -145,7 +145,7 @@ void unpack_weights(
vint sum_plane2(8);

vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
int max_weight_count = hmax_s(weight_count);

promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
4 changes: 2 additions & 2 deletions Source/astcenc_ideal_endpoints_and_weights.cpp
@@ -889,7 +889,7 @@ void compute_ideal_weights_for_decimation(

// Accumulate error weighting of all the texels using this weight
vint weight_texel_count(di.weight_texel_count + i);
unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
unsigned int max_texel_count = hmax_s(weight_texel_count);
promise(max_texel_count > 0);

for (unsigned int j = 0; j < max_texel_count; j++)
@@ -947,7 +947,7 @@ void compute_ideal_weights_for_decimation(

// Accumulate error weighting of all the texels using this weight
vint weight_texel_count(di.weight_texel_count + i);
unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
unsigned int max_texel_count = hmax_s(weight_texel_count);
promise(max_texel_count > 0);

for (unsigned int j = 0; j < max_texel_count; j++)
6 changes: 3 additions & 3 deletions Source/astcenc_pick_best_endpoint_format.cpp
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
@@ -1306,8 +1306,8 @@ unsigned int compute_ideal_endpoint_formats(
// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
vbest_error_index = hmin(vbest_error_index);
int best_error_index = vbest_error_index.lane<0>();

int best_error_index = hmin_s(vbest_error_index);

best_error_weights[i] = best_error_index;

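For context, a hypothetical scalar restatement of the selection above; select(a, b, mask) keeps b where the mask is set, and lane_error, min_error and lane_index are illustrative names rather than identifiers from the source:

    int best_error_index = 0x7FFFFFFF;
    for (unsigned int lane = 0; lane < ASTCENC_SIMD_WIDTH; lane++)
    {
        // Only lanes that achieved the minimum error are candidates; ties resolve
        // to the smallest stored index, so the result is invariant across SIMD widths.
        if (lane_error[lane] == min_error && lane_index[lane] < best_error_index)
        {
            best_error_index = lane_index[lane];
        }
    }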
103 changes: 19 additions & 84 deletions Source/astcenc_vecmathlib_avx2_8.h
@@ -74,18 +74,6 @@ struct vfloat8
m = _mm256_set1_ps(a);
}

/**
* @brief Construct from 8 scalar values.
*
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vfloat8(
float a, float b, float c, float d,
float e, float f, float g, float h)
{
m = _mm256_set_ps(h, g, f, e, d, c, b, a);
}

/**
* @brief Construct from an existing SIMD register.
*/
@@ -94,20 +82,6 @@ struct vfloat8
m = a;
}

/**
* @brief Get the scalar value of a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE float lane() const
{
#if !defined(__clang__) && defined(_MSC_VER)
return m.m256_f32[l];
#else
union { __m256 m; float f[8]; } cvt;
cvt.m = m;
return cvt.f[l];
#endif
}

/**
* @brief Factory that returns a vector of zeros.
*/
@@ -132,14 +106,6 @@ struct vfloat8
return vfloat8(_mm256_load_ps(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat8 lane_id()
{
return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
}

/**
* @brief The vector ...
*/
@@ -190,18 +156,6 @@ struct vint8
m = _mm256_set1_epi32(a);
}

/**
* @brief Construct from 8 scalar values.
*
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vint8(
int a, int b, int c, int d,
int e, int f, int g, int h)
{
m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
}

/**
* @brief Construct from an existing SIMD register.
*/
@@ -210,20 +164,6 @@ struct vint8
m = a;
}

/**
* @brief Get the scalar from a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE int lane() const
{
#if !defined(__clang__) && defined(_MSC_VER)
return m.m256i_i32[l];
#else
union { __m256i m; int f[8]; } cvt;
cvt.m = m;
return cvt.f[l];
#endif
}

/**
* @brief Factory that returns a vector of zeros.
*/
@@ -528,6 +468,14 @@ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
return vmin;
}

/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
{
return _mm256_cvtsi256_si32(hmin(a).m);
}

/**
* @brief Return the horizontal maximum of a vector.
*/
@@ -543,6 +491,14 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
return vmax;
}

/**
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
{
return _mm256_cvtsi256_si32(hmax(a).m);
}

/**
* @brief Store a vector to a 16B aligned memory address.
*/
@@ -570,14 +526,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
@@ -786,19 +734,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
return a;
}

/**
* @brief Return a clamped value between 0.0f and max.
*
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
* be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
return a;
}

/**
* @brief Return a clamped value between 0.0f and 1.0f.
*
@@ -857,7 +792,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
*/
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
return hmin(a).lane<0>();
return _mm256_cvtss_f32(hmin(a).m);
}

/**
@@ -887,7 +822,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
*/
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
return hmax(a).lane<0>();
return _mm256_cvtss_f32(hmax(a).m);
}

/**
@@ -1146,7 +1081,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
* @brief Return a vector of interleaved RGBA data.
*
* Input vectors have the value stored in the bottom 8 bits of each lane,
* with high bits set to zero.
* with high bits set to zero.
*
* Output vector stores a single RGBA texel packed in each lane.
*/
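A short usage sketch of the new AVX2 scalar reductions: hmin() and hmax() leave the result broadcast across all eight lanes, so _mm256_cvtsi256_si32 can read it from lane 0 without the union-based lane<0>() accessor removed above. The data values are illustrative, and vint8's pointer-load constructor is assumed:

    alignas(32) int data[8] { 3, 7, 1, 9, 4, 6, 2, 8 };
    vint8 v(data);          // load eight lanes
    int lo = hmin_s(v);     // 1
    int hi = hmax_s(v);     // 9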
28 changes: 16 additions & 12 deletions Source/astcenc_vecmathlib_common_4.h
@@ -129,6 +129,22 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
return a.lane<0>() + a.lane<1>() + a.lane<2>();
}

/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
{
return hmin(a).lane<0>();
}

/**
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
{
return hmax(a).lane<0>();
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================
@@ -222,18 +238,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
return min(max(a, minv), maxv);
}

/**
* @brief Return the clamped value between 0.0f and max.
*
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
* be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
{
// Do not reorder - second operand will return if either is NaN
return min(max(a, vfloat4::zero()), maxv);
}

/**
* @brief Return the clamped value between 0.0f and 1.0f.
*
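The 4-wide helpers added above are generic: they reuse the per-ISA hmin()/hmax() reductions together with the lane<0>() accessor, which the 4-wide types keep. A brief usage sketch, assuming vint4's four-scalar constructor:

    vint4 v(3, 7, 1, 9);
    int lo = hmin_s(v);     // 1
    int hi = hmax_s(v);     // 9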
24 changes: 0 additions & 24 deletions Source/astcenc_vecmathlib_neon_4.h
@@ -134,15 +134,6 @@ struct vfloat4
return vfloat4(vld1q_f32(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
return vfloat4(vld1q_f32(data));
}

/**
* @brief Return a swizzled float 2.
*/
@@ -611,21 +602,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
alignas(16) int idx[4];
storea(indices, idx);
alignas(16) int vals[4];
vals[0] = base[idx[0]];
vals[1] = base[idx[1]];
vals[2] = base[idx[2]];
vals[3] = base[idx[3]];
return vint4(vals);
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
19 changes: 0 additions & 19 deletions Source/astcenc_vecmathlib_none_4.h
@@ -139,14 +139,6 @@ struct vfloat4
return vfloat4(p);
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
}

/**
* @brief Return a swizzled float 2.
*/
@@ -684,17 +676,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
std::memcpy(p, a.m, sizeof(uint8_t) * 4);
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
return vint4(base[indices.m[0]],
base[indices.m[1]],
base[indices.m[2]],
base[indices.m[3]]);
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
22 changes: 0 additions & 22 deletions Source/astcenc_vecmathlib_sse_4.h
@@ -142,14 +142,6 @@ struct vfloat4
return vfloat4(_mm_load_ps(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
return vfloat4(_mm_set_ps(3, 2, 1, 0));
}

/**
* @brief Return a swizzled float 2.
*/
@@ -663,20 +655,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
#if ASTCENC_AVX >= 2
return vint4(_mm_i32gather_epi32(base, indices.m, 4));
#else
alignas(16) int idx[4];
storea(indices, idx);
return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
#endif
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
2 changes: 1 addition & 1 deletion Source/astcenc_weight_align.cpp
@@ -164,7 +164,7 @@ static void compute_lowest_and_highest_weight(
promise(weight_count > 0);
promise(max_angular_steps > 0);

vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);

// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)
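With vfloat::lane_id() removed, the step sizes are now built from the integer lane IDs and converted. A sketch of the values this line produces, assuming vint::lane_id() yields { 0, 1, ..., ASTCENC_SIMD_WIDTH - 1 } as in the per-ISA headers:

    vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
    // For an 8-wide build this holds { 1.0f, 2.0f, ..., 8.0f }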