Support for ARM SVE2. (#8051)
* Checkpoint SVE2 restart.

* Remove dead code. Add new test.

* Update cmake for new file.

* Checkpoint progress on SVE2.

* Checkpoint ARM SVE2 support. Passes correctness_simd_op_check_sve2 test at 128 and 256 bits.

* Remove an opportunity for RISC-V codegen to change due to SVE2 support.

* Ensure SVE intrinsics get vscale vectors and non-SVE ones get fixed vectors.

Use proper prefix for neon intrinsics.

Comment cleanups.

* Checkpoint SVE2 work. Generally passes tests, though using both NEON
and SVE2 with simd_op_check_sve2 fails, as both possibilities need to be
allowed for 128-bit or smaller operations.

* Remove an unfavored implementation possibility.

* Fix opcode recognition in test to handle some cases that show up.

Change name of test class to avoid confusion.

* Formatting fixes.

Replace internal_error with a nop return when CodeGen_LLVM::match_vector_type_scalable is called on a scalar.

* Formatting fix.

* Limit SVE2 test to LLVM 19.

Remove dead code.

* Fix a degenerate case that asked for zero-sized vectors via a Halide type
with zero lanes, which is not correct.

* Fix confusion about Neon64/Neon128 and make it clear this is just the
width multiplier applied to intrinsics.

* Remove an extraneous commented-out line.

* Address some review feedback. Mostly comment fixes.

* Fix missed conflict resolution.

* Fix some TODOs in SVE code. Move a utility function to Util.h and share
the common code with the other obvious use site.

* Formatting.

* Add missed refactor change.

* Add issue to TODO comment.

* Remove TODOs that don't seem necessary.

* Add issue for TODO.

* Add issue for TODO.

* Remove dubious-looking FP-to-int code that was ifdef'd out. It doesn't
look like a TODO is needed anymore.

* Add issues for TODOs.

* Update simd_op_check_sve2.cpp

* Make a deep copy of each piece of test IR so that we can parallelize.

* Fix two clang-tidy warnings

* Remove try/catch block from simd-op-check-sve2

* Don't try to run SVE2 code if vector_bits doesn't match host.

* Add support for fcvtm/p, make scalars go through pattern matching too (#8151)

* Don't do arm neon instruction selection on scalars

This revealed a bug. FindIntrinsics was not enabled for scalars anyway,
so it was semi-pointless.

---------

Co-authored-by: Zalman Stern <zalman@macbook-pro.lan>
Co-authored-by: Steven Johnson <srj@google.com>
Co-authored-by: Andrew Adams <andrew.b.adams@gmail.com>
4 people committed Mar 15, 2024
1 parent f841a27 commit 76a7dd4
Showing 19 changed files with 2,836 additions and 348 deletions.
1,388 changes: 1,113 additions & 275 deletions src/CodeGen_ARM.cpp

Large diffs are not rendered by default.

230 changes: 186 additions & 44 deletions src/CodeGen_LLVM.cpp

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions src/CodeGen_LLVM.h
@@ -579,6 +579,13 @@ class CodeGen_LLVM : public IRVisitor {
llvm::Constant *get_splat(int lanes, llvm::Constant *value,
VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const;

/** Make sure a value type has the same scalable/fixed vector type as a guide. */
// @{
llvm::Value *match_vector_type_scalable(llvm::Value *value, VectorTypeConstraint constraint);
llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Type *guide);
llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Value *guide);
// @}

/** Support for generating LLVM vector predication intrinsics
* ("@llvm.vp.*" and "@llvm.experimental.vp.*")
*/
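For readers skimming past the unrendered CodeGen diffs: the heart of these helpers is deciding whether a value already agrees with a guide on fixed-vs-scalable vector-ness. A minimal sketch of that check against the LLVM C++ API follows (vector_kinds_match is an illustrative name, not Halide's implementation):

#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/Value.h>
#include <llvm/Support/Casting.h>

// Sketch: does a value agree with a guide type on fixed vs. scalable
// vector-ness? (Illustrative; the real helpers also perform the conversion.)
bool vector_kinds_match(llvm::Value *value, llvm::Type *guide) {
    llvm::Type *t = value->getType();
    if (!t->isVectorTy() || !guide->isVectorTy()) {
        return true;  // Scalars need no conversion (the real helper is a nop here).
    }
    return llvm::isa<llvm::ScalableVectorType>(t) ==
           llvm::isa<llvm::ScalableVectorType>(guide);
}

When the kinds differ, the usual LLVM route for the actual conversion is the llvm.vector.insert / llvm.vector.extract intrinsics, which move a fixed-width payload into or out of a scalable register.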
6 changes: 4 additions & 2 deletions src/Function.cpp
@@ -491,8 +491,10 @@ ExternFuncArgument deep_copy_extern_func_argument_helper(const ExternFuncArgumen
} // namespace

void Function::deep_copy(const FunctionPtr &copy, DeepCopyMap &copied_map) const {
internal_assert(copy.defined() && contents.defined())
<< "Cannot deep-copy undefined Function\n";
internal_assert(copy.defined())
<< "Cannot deep-copy to undefined Function\n";
internal_assert(contents.defined())
<< "Cannot deep-copy from undefined Function\n";

// Add reference to this Function's deep-copy to the map in case of
// self-reference, e.g. self-reference in an Definition.
1 change: 1 addition & 0 deletions src/IR.cpp
@@ -690,6 +690,7 @@ const char *const intrinsic_op_names[] = {
"widening_shift_left",
"widening_shift_right",
"widening_sub",
"get_runtime_vscale",
};

static_assert(sizeof(intrinsic_op_names) / sizeof(intrinsic_op_names[0]) == Call::IntrinsicOpCount,
2 changes: 2 additions & 0 deletions src/IR.h
@@ -629,6 +629,8 @@ struct Call : public ExprNode<Call> {
widening_shift_right,
widening_sub,

get_runtime_vscale,

IntrinsicOpCount // Sentinel: keep last.
};

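The new get_runtime_vscale intrinsic lets IR query the hardware vector-length multiplier at runtime. A plausible lowering is a call to LLVM's llvm.vscale intrinsic; the sketch below is an assumption about that lowering (the committed code is in the unrendered CodeGen diffs above):

#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/Module.h>

// Sketch: lower get_runtime_vscale to a call to @llvm.vscale.i32().
llvm::Value *lower_get_runtime_vscale(llvm::IRBuilder<> &builder) {
    llvm::Module *m = builder.GetInsertBlock()->getModule();
    llvm::Function *vscale = llvm::Intrinsic::getDeclaration(
        m, llvm::Intrinsic::vscale, {builder.getInt32Ty()});
    return builder.CreateCall(vscale);
}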
3 changes: 3 additions & 0 deletions src/IRMatch.cpp
@@ -262,6 +262,9 @@ class IRMatch : public IRVisitor {
if (result && e && types_match(op->type, e->type)) {
expr = e->value;
op->value.accept(this);
} else if (op->lanes == 0 && types_match(op->value.type(), expr.type())) {
// zero lanes means any number of lanes, so match scalars too.
op->value.accept(this);
} else {
result = false;
}
6 changes: 6 additions & 0 deletions src/LLVM_Output.cpp
@@ -331,6 +331,12 @@ std::unique_ptr<llvm::Module> clone_module(const llvm::Module &module_in) {
// Read it back in.
llvm::MemoryBufferRef buffer_ref(llvm::StringRef(clone_buffer.data(), clone_buffer.size()), "clone_buffer");
auto cloned_module = llvm::parseBitcodeFile(buffer_ref, module_in.getContext());

// TODO(<add issue>): Add support for returning the error.
if (!cloned_module) {
llvm::dbgs() << cloned_module.takeError();
module_in.print(llvm::dbgs(), nullptr, false, true);
}
internal_assert(cloned_module);

return std::move(cloned_module.get());
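As the TODO notes, a failed clone is currently printed rather than propagated. For reference, the standard way to turn a failed llvm::Expected into a message a caller could return is llvm::toString; a hedged sketch (illustrative, not the committed code):

#include <llvm/Support/Error.h>
#include <string>

// Sketch: extract a printable message from a failed llvm::Expected<T>.
template<typename T>
std::string describe_failure(llvm::Expected<T> &result) {
    if (result) {
        return "";  // No error to describe.
    }
    return llvm::toString(result.takeError());
}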
5 changes: 1 addition & 4 deletions src/StorageFolding.cpp
@@ -10,17 +10,14 @@
#include "Monotonic.h"
#include "Simplify.h"
#include "Substitute.h"
#include "Util.h"
#include <utility>

namespace Halide {
namespace Internal {

namespace {

int64_t next_power_of_two(int64_t x) {
return static_cast<int64_t>(1) << static_cast<int64_t>(std::ceil(std::log2(x)));
}

using std::map;
using std::string;
using std::vector;
11 changes: 11 additions & 0 deletions src/Util.h
@@ -13,6 +13,7 @@
/** \file
* Various utility functions used internally Halide. */

#include <cmath>
#include <cstdint>
#include <cstring>
#include <functional>
@@ -532,6 +533,16 @@ int clz64(uint64_t x);
int ctz64(uint64_t x);
// @}

/** Return an integer 2^n, for some n, which is >= x. Argument x must be > 0. */
inline int64_t next_power_of_two(int64_t x) {
return static_cast<int64_t>(1) << static_cast<int64_t>(std::ceil(std::log2(x)));
}

/** Round x up to the next multiple of n. n need not be a power of two. */
template<typename T>
inline T align_up(T x, int n) {
return (x + n - 1) / n * n;
}

} // namespace Internal
} // namespace Halide

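A quick standalone check of the new helper's contract (using a copy of the definition above so it compiles on its own):

#include <cassert>
#include <cmath>
#include <cstdint>

// Standalone copy of next_power_of_two, for illustration.
int64_t next_power_of_two(int64_t x) {
    return static_cast<int64_t>(1) << static_cast<int64_t>(std::ceil(std::log2(x)));
}

int main() {
    assert(next_power_of_two(1) == 1);       // Already a power of two: unchanged.
    assert(next_power_of_two(5) == 8);
    assert(next_power_of_two(1000) == 1024);
    return 0;
}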
11 changes: 3 additions & 8 deletions src/WasmExecutor.cpp
@@ -101,11 +101,6 @@ struct debug_sink {
// BDMalloc
// ---------------------

template<typename T>
inline T align_up(T p, int alignment = 32) {
return (p + alignment - 1) & ~(alignment - 1);
}

// Debugging our Malloc is extremely noisy and usually undesired

#define BDMALLOC_DEBUG_LEVEL 0
@@ -318,7 +313,7 @@ std::vector<char> compile_to_wasm(const Module &module, const std::string &fn_na
stack_size += cg->get_requested_alloca_total();
}

stack_size = align_up(stack_size);
stack_size = align_up(stack_size, 32);
wdebug(1) << "Requesting stack size of " << stack_size << "\n";

std::unique_ptr<llvm::Module> llvm_module =
@@ -708,7 +703,7 @@ wasm32_ptr_t hostbuf_to_wasmbuf(WabtContext &wabt_context, const halide_buffer_t
const size_t dims_size_in_bytes = sizeof(halide_dimension_t) * src->dimensions;
const size_t dims_offset = sizeof(wasm_halide_buffer_t);
const size_t mem_needed_base = sizeof(wasm_halide_buffer_t) + dims_size_in_bytes;
const size_t host_offset = align_up(mem_needed_base);
const size_t host_offset = align_up(mem_needed_base, 32);
const size_t host_size_in_bytes = src->size_in_bytes();
const size_t mem_needed = host_offset + host_size_in_bytes;

@@ -1613,7 +1608,7 @@ wasm32_ptr_t hostbuf_to_wasmbuf(const Local<Context> &context, const halide_buff
const size_t dims_size_in_bytes = sizeof(halide_dimension_t) * src->dimensions;
const size_t dims_offset = sizeof(wasm_halide_buffer_t);
const size_t mem_needed_base = sizeof(wasm_halide_buffer_t) + dims_size_in_bytes;
const size_t host_offset = align_up(mem_needed_base);
const size_t host_offset = align_up(mem_needed_base, 32);
const size_t host_size_in_bytes = src->size_in_bytes();
const size_t mem_needed = host_offset + host_size_in_bytes;

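One subtlety in the change above: the removed WasmExecutor helper used a bitmask, which is only correct when the alignment is a power of two, whereas the shared Util.h helper divides and so handles any positive n. A standalone comparison (both bodies copied for illustration):

#include <cassert>

template<typename T>
T align_up_mask(T p, int a) {  // The removed helper: power-of-two alignments only.
    return (p + a - 1) & ~(a - 1);
}

template<typename T>
T align_up_div(T x, int n) {   // The shared Util.h helper: any positive n.
    return (x + n - 1) / n * n;
}

int main() {
    assert(align_up_mask(13, 8) == 16 && align_up_div(13, 8) == 16);  // Agree for powers of two.
    assert(align_up_mask(13, 12) == 16);  // Wrong for n == 12 (not a power of two)...
    assert(align_up_div(13, 12) == 24);   // ...while the divide version rounds correctly.
    return 0;
}

All call sites in this file pass 32, so behavior is unchanged here; the divide form simply makes the shared helper safe for general use.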
6 changes: 5 additions & 1 deletion src/runtime/HalideRuntime.h
@@ -1246,6 +1246,10 @@ enum halide_error_code_t {
/** A factor used to split a loop was discovered to be zero or negative at
* runtime. */
halide_error_code_split_factor_not_positive = -46,

/** "vscale" value of Scalable Vector detected in runtime does not match
* the vscale value used in compilation. */
halide_error_code_vscale_invalid = -47,
};

/** Halide calls the functions below on various error conditions. The
@@ -1321,7 +1325,7 @@ extern int halide_error_storage_bound_too_small(void *user_context, const char *
int provided_size, int required_size);
extern int halide_error_device_crop_failed(void *user_context);
extern int halide_error_split_factor_not_positive(void *user_context, const char *func_name, const char *orig, const char *outer, const char *inner, const char *factor_str, int factor);

extern int halide_error_vscale_invalid(void *user_context, const char *func_name, int runtime_vscale, int compiletime_vscale);
// @}

/** Optional features a compilation Target can have.
76 changes: 63 additions & 13 deletions src/runtime/aarch64.ll
@@ -48,25 +48,34 @@ define weak_odr <2 x i64> @vabdl_u32x2(<2 x i32> %a, <2 x i32> %b) nounwind alwa

declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %x) nounwind readnone;
declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %x) nounwind readnone;
declare float @llvm.aarch64.neon.frecpe.f32(float)
declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %x) nounwind readnone;
declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %x) nounwind readnone;
declare float @llvm.aarch64.neon.frsqrte.f32(float)
declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %x, <4 x float> %y) nounwind readnone;
declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %x, <2 x float> %y) nounwind readnone;
declare float @llvm.aarch64.neon.frecps.f32(float, float)
declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %x, <4 x float> %y) nounwind readnone;
declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %x, <2 x float> %y) nounwind readnone;
declare float @llvm.aarch64.neon.frsqrts.f32(float, float)

declare <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> %x) nounwind readnone;
declare <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> %x) nounwind readnone;
declare half @llvm.aarch64.neon.frecpe.f16(half)
declare <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> %x) nounwind readnone;
declare <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> %x) nounwind readnone;
declare half @llvm.aarch64.neon.frsqrte.f16(half)
declare <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> %x, <8 x half> %y) nounwind readnone;
declare <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> %x, <4 x half> %y) nounwind readnone;
declare half @llvm.aarch64.neon.frecps.f16(half, half)
declare <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> %x, <8 x half> %y) nounwind readnone;
declare <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> %x, <4 x half> %y) nounwind readnone;
declare half @llvm.aarch64.neon.frsqrts.f16(half, half)

define weak_odr float @fast_inverse_f32(float %x) nounwind alwaysinline {
%vec = insertelement <2 x float> poison, float %x, i32 0
%approx = tail call <2 x float> @fast_inverse_f32x2(<2 x float> %vec)
%result = extractelement <2 x float> %approx, i32 0
%approx = tail call float @llvm.aarch64.neon.frecpe.f32(float %x)
%correction = tail call float @llvm.aarch64.neon.frecps.f32(float %approx, float %x)
%result = fmul float %approx, %correction
ret float %result
}

@@ -85,9 +94,9 @@ define weak_odr <4 x float> @fast_inverse_f32x4(<4 x float> %x) nounwind alwaysi
}

define weak_odr half @fast_inverse_f16(half %x) nounwind alwaysinline {
%vec = insertelement <4 x half> poison, half %x, i32 0
%approx = tail call <4 x half> @fast_inverse_f16x4(<4 x half> %vec)
%result = extractelement <4 x half> %approx, i32 0
%approx = tail call half @llvm.aarch64.neon.frecpe.f16(half %x)
%correction = tail call half @llvm.aarch64.neon.frecps.f16(half %approx, half %x)
%result = fmul half %approx, %correction
ret half %result
}

@@ -106,9 +115,10 @@ define weak_odr <8 x half> @fast_inverse_f16x8(<8 x half> %x) nounwind alwaysinl
}

define weak_odr float @fast_inverse_sqrt_f32(float %x) nounwind alwaysinline {
%vec = insertelement <2 x float> poison, float %x, i32 0
%approx = tail call <2 x float> @fast_inverse_sqrt_f32x2(<2 x float> %vec)
%result = extractelement <2 x float> %approx, i32 0
%approx = tail call float @llvm.aarch64.neon.frsqrte.f32(float %x)
%approx2 = fmul float %approx, %approx
%correction = tail call float @llvm.aarch64.neon.frsqrts.f32(float %approx2, float %x)
%result = fmul float %approx, %correction
ret float %result
}

Expand All @@ -129,9 +139,10 @@ define weak_odr <4 x float> @fast_inverse_sqrt_f32x4(<4 x float> %x) nounwind al
}

define weak_odr half @fast_inverse_sqrt_f16(half %x) nounwind alwaysinline {
%vec = insertelement <4 x half> poison, half %x, i32 0
%approx = tail call <4 x half> @fast_inverse_sqrt_f16x4(<4 x half> %vec)
%result = extractelement <4 x half> %approx, i32 0
%approx = tail call half @llvm.aarch64.neon.frsqrte.f16(half %x)
%approx2 = fmul half %approx, %approx
%correction = tail call half @llvm.aarch64.neon.frsqrts.f16(half %approx2, half %x)
%result = fmul half %approx, %correction
ret half %result
}

@@ -149,4 +160,43 @@ define weak_odr <8 x half> @fast_inverse_sqrt_f16x8(<8 x half> %x) nounwind alwa
%correction = tail call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> %approx2, <8 x half> %x)
%result = fmul <8 x half> %approx, %correction
ret <8 x half> %result
}
}

declare <vscale x 4 x float> @llvm.aarch64.sve.frecpe.x.nxv4f32(<vscale x 4 x float> %x) nounwind readnone;
declare <vscale x 4 x float> @llvm.aarch64.sve.frsqrte.x.nxv4f32(<vscale x 4 x float> %x) nounwind readnone;
declare <vscale x 4 x float> @llvm.aarch64.sve.frecps.x.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x float> %y) nounwind readnone;
declare <vscale x 4 x float> @llvm.aarch64.sve.frsqrts.x.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x float> %y) nounwind readnone;
declare <vscale x 8 x half> @llvm.aarch64.sve.frecpe.x.nxv8f16(<vscale x 8 x half> %x) nounwind readnone;
declare <vscale x 8 x half> @llvm.aarch64.sve.frsqrte.x.nxv8f16(<vscale x 8 x half> %x) nounwind readnone;
declare <vscale x 8 x half> @llvm.aarch64.sve.frecps.x.nxv8f16(<vscale x 8 x half> %x, <vscale x 8 x half> %y) nounwind readnone;
declare <vscale x 8 x half> @llvm.aarch64.sve.frsqrts.x.nxv8f16(<vscale x 8 x half> %x, <vscale x 8 x half> %y) nounwind readnone;

define weak_odr <vscale x 4 x float> @fast_inverse_f32nx4(<vscale x 4 x float> %x) nounwind alwaysinline {
%approx = tail call <vscale x 4 x float> @llvm.aarch64.sve.frecpe.x.nxv4f32(<vscale x 4 x float> %x)
%correction = tail call <vscale x 4 x float> @llvm.aarch64.sve.frecps.x.nxv4f32(<vscale x 4 x float> %approx, <vscale x 4 x float> %x)
%result = fmul <vscale x 4 x float> %approx, %correction
ret <vscale x 4 x float> %result
}

define weak_odr <vscale x 8 x half> @fast_inverse_f16nx8(<vscale x 8 x half> %x) nounwind alwaysinline {
%approx = tail call <vscale x 8 x half> @llvm.aarch64.sve.frecpe.x.nxv8f16(<vscale x 8 x half> %x)
%correction = tail call <vscale x 8 x half> @llvm.aarch64.sve.frecps.x.nxv8f16(<vscale x 8 x half> %approx, <vscale x 8 x half> %x)
%result = fmul <vscale x 8 x half> %approx, %correction
ret <vscale x 8 x half> %result
}

define weak_odr <vscale x 4 x float> @fast_inverse_sqrt_f32nx4(<vscale x 4 x float> %x) nounwind alwaysinline {
%approx = tail call <vscale x 4 x float> @llvm.aarch64.sve.frsqrte.x.nxv4f32(<vscale x 4 x float> %x)
%approx2 = fmul <vscale x 4 x float> %approx, %approx
%correction = tail call <vscale x 4 x float> @llvm.aarch64.sve.frsqrts.x.nxv4f32(<vscale x 4 x float> %approx2, <vscale x 4 x float> %x)
%result = fmul <vscale x 4 x float> %approx, %correction
ret <vscale x 4 x float> %result
}

define weak_odr <vscale x 8 x half> @fast_inverse_sqrt_f16nx8(<vscale x 8 x half> %x) nounwind alwaysinline {
%approx = tail call <vscale x 8 x half> @llvm.aarch64.sve.frsqrte.x.nxv8f16(<vscale x 8 x half> %x)
%approx2 = fmul <vscale x 8 x half> %approx, %approx
%correction = tail call <vscale x 8 x half> @llvm.aarch64.sve.frsqrts.x.nxv8f16(<vscale x 8 x half> %approx2, <vscale x 8 x half> %x)
%result = fmul <vscale x 8 x half> %approx, %correction
ret <vscale x 8 x half> %result
}
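For context on the pattern used throughout this file: each fast_inverse* and fast_inverse_sqrt* body is one Newton-Raphson refinement of a hardware estimate. On AArch64, frecps(a, b) computes 2 - a*b and frsqrts(a, b) computes (3 - a*b)/2, so the estimate-then-correct sequences implement

\begin{aligned}
y_1 &= y_0\,(2 - x\,y_0), & y_0 &= \mathrm{frecpe}(x) && \text{(reciprocal)},\\
y_1 &= y_0\,\frac{3 - x\,y_0^2}{2}, & y_0 &= \mathrm{frsqrte}(x) && \text{(reciprocal square root)},
\end{aligned}

which is why the rsqrt variants square the estimate before the frsqrts step.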
8 changes: 8 additions & 0 deletions src/runtime/errors.cpp
@@ -300,4 +300,12 @@ WEAK int halide_error_split_factor_not_positive(void *user_context, const char *
return halide_error_code_split_factor_not_positive;
}

WEAK int halide_error_vscale_invalid(void *user_context, const char *func_name, int runtime_vscale, int compiletime_vscale) {
error(user_context)
<< "The function " << func_name
<< " is compiled with the assumption that vscale of Scalable Vector is " << compiletime_vscale
<< ". However, the detected runtime vscale is " << runtime_vscale << ".";
return halide_error_code_vscale_invalid;
}

} // extern "C"
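Tying this to the new get_runtime_vscale intrinsic: a pipeline compiled for a specific vscale presumably guards its entry with a comparison along these lines (a hedged sketch; check_vscale is an illustrative name, and the real check is emitted by codegen):

extern "C" int halide_error_vscale_invalid(void *user_context, const char *func_name,
                                           int runtime_vscale, int compiletime_vscale);

// Sketch: compare the vscale detected at runtime against the value the
// pipeline was compiled for, reporting a mismatch through the new error hook.
int check_vscale(void *user_context, const char *func_name,
                 int runtime_vscale, int compiletime_vscale) {
    if (runtime_vscale != compiletime_vscale) {
        return halide_error_vscale_invalid(user_context, func_name,
                                           runtime_vscale, compiletime_vscale);
    }
    return 0;  // halide_error_code_success
}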
28 changes: 27 additions & 1 deletion src/runtime/posix_math.ll
@@ -322,4 +322,30 @@ define weak_odr double @neg_inf_f64() nounwind uwtable readnone alwaysinline {

define weak_odr double @nan_f64() nounwind uwtable readnone alwaysinline {
ret double 0x7FF8000000000000
}
}

; For scalable vectors with a non-natural vector size, LLVM doesn't auto-vectorize the scalar versions above, so provide explicit vector versions
define weak_odr <vscale x 4 x float> @inf_f32nx4() nounwind uwtable readnone alwaysinline {
ret <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> undef, float 0x7FF0000000000000, i32 0), <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer)
}

define weak_odr <vscale x 4 x float> @neg_inf_f32nx4() nounwind uwtable readnone alwaysinline {
ret <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> undef, float 0xFFF0000000000000, i32 0), <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer)
}

define weak_odr <vscale x 4 x float> @nan_f32nx4() nounwind uwtable readnone alwaysinline {
ret <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> undef, float 0x7FF8000000000000, i32 0), <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer)
}


define weak_odr <vscale x 2 x double> @inf_f64nx2() nounwind uwtable readnone alwaysinline {
ret <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> undef, double 0x7FF0000000000000, i32 0), <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer)
}

define weak_odr <vscale x 2 x double> @neg_inf_f64nx2() nounwind uwtable readnone alwaysinline {
ret <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> undef, double 0xFFF0000000000000, i32 0), <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer)
}

define weak_odr <vscale x 2 x double> @nan_f64nx2() nounwind uwtable readnone alwaysinline {
ret <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> undef, double 0x7FF8000000000000, i32 0), <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer)
}
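The shufflevector(insertelement(...)) pattern above is the textual-IR idiom for splatting a scalar across a scalable vector, since a literal lane list is impossible when the lane count is only known at runtime. From the C++ API the same constant can be built directly; a sketch (illustrative, not the committed code):

#include <llvm/IR/Constants.h>
#include <llvm/IR/DerivedTypes.h>
#include <llvm/IR/LLVMContext.h>

// Sketch: build the <vscale x 4 x float> +infinity splat programmatically.
llvm::Constant *inf_f32nx4_splat(llvm::LLVMContext &ctx) {
    llvm::Constant *inf =
        llvm::ConstantFP::getInfinity(llvm::Type::getFloatTy(ctx), /*Negative=*/false);
    return llvm::ConstantVector::getSplat(llvm::ElementCount::getScalable(4), inf);
}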
1 change: 1 addition & 0 deletions src/runtime/runtime_api.cpp
@@ -89,6 +89,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
(void *)&halide_error_unaligned_host_ptr,
(void *)&halide_error_storage_bound_too_small,
(void *)&halide_error_device_crop_failed,
(void *)&halide_error_vscale_invalid,
(void *)&halide_float16_bits_to_double,
(void *)&halide_float16_bits_to_float,
(void *)&halide_free,
1 change: 1 addition & 0 deletions test/correctness/CMakeLists.txt
@@ -277,6 +277,7 @@ tests(GROUPS correctness
simd_op_check_hvx.cpp
simd_op_check_powerpc.cpp
simd_op_check_riscv.cpp
simd_op_check_sve2.cpp
simd_op_check_wasm.cpp
simd_op_check_x86.cpp
simplified_away_embedded_image.cpp
7 changes: 7 additions & 0 deletions test/correctness/simd_op_check_arm.cpp
@@ -230,6 +230,13 @@ class SimdOpCheckARM : public SimdOpCheckTest {
check(arm32 ? "vcvt.s32.f32" : "fcvtzs", 2 * w, i32(f32_1));
// skip the fixed point conversions for now

if (!arm32) {
check("fcvtmu *v", 2 * w, u32(floor(f32_1)));
check("fcvtpu *v", 2 * w, u32(ceil(f32_1)));
check("fcvtms *v", 2 * w, i32(floor(f32_1)));
check("fcvtps *v", 2 * w, i32(ceil(f32_1)));
}

// VDIV - F, D Divide
// This doesn't actually get vectorized in 32-bit. Not sure cortex processors can do vectorized division.
check(arm32 ? "vdiv.f32" : "fdiv", 2 * w, f32_1 / f32_2);