From dcd3168c97c6024b24f80abbb2407c81d9b0a25a Mon Sep 17 00:00:00 2001 From: beetrees Date: Fri, 23 May 2025 15:51:51 +0100 Subject: [PATCH 1/8] Use correct sign extension on `__powi*f2` arguments --- src/abi/mod.rs | 28 ++++++++++++++++++++++++++++ src/intrinsics/mod.rs | 3 ++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/abi/mod.rs b/src/abi/mod.rs index 70d29596e..5f7bf3821 100644 --- a/src/abi/mod.rs +++ b/src/abi/mod.rs @@ -828,3 +828,31 @@ pub(crate) fn codegen_drop<'tcx>( } } } + +pub(crate) fn lib_call_arg_param(tcx: TyCtxt<'_>, ty: Type, is_signed: bool) -> AbiParam { + let param = AbiParam::new(ty); + if ty.is_int() && u64::from(ty.bits()) < tcx.data_layout.pointer_size.bits() { + match (&*tcx.sess.target.arch, &*tcx.sess.target.vendor) { + ("x86_64", _) | ("aarch64", "apple") => match (ty, is_signed) { + (types::I8 | types::I16, true) => param.sext(), + (types::I8 | types::I16, false) => param.uext(), + _ => param, + }, + ("aarch64", _) => param, + ("riscv64", _) => match (ty, is_signed) { + (types::I32, _) | (_, true) => param.sext(), + _ => param.uext(), + }, + ("s390x", _) => { + if is_signed { + param.sext() + } else { + param.uext() + } + } + _ => unimplemented!("{:?}", tcx.sess.target.arch), + } + } else { + param + } +} diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs index 25e224ebf..7c18922d9 100644 --- a/src/intrinsics/mod.rs +++ b/src/intrinsics/mod.rs @@ -416,7 +416,8 @@ fn codegen_float_intrinsic_call<'tcx>( // These intrinsics aren't supported natively by Cranelift. // Lower them to a libcall. sym::powif32 | sym::powif64 => { - let input_tys: Vec<_> = vec![AbiParam::new(clif_ty), AbiParam::new(types::I32)]; + let input_tys: Vec<_> = + vec![AbiParam::new(clif_ty), lib_call_arg_param(fx.tcx, types::I32, true)]; let ret_val = fx.lib_call(name, input_tys, vec![AbiParam::new(clif_ty)], &args)[0]; CValue::by_val(ret_val, fx.layout_of(ty)) } From f0fb19ccc87e18402079fbe3106d8090cc8d9f64 Mon Sep 17 00:00:00 2001 From: beetrees Date: Sat, 24 May 2025 13:45:12 +0100 Subject: [PATCH 2/8] Add missing float libcalls to `compiler_builtins.rs` --- src/compiler_builtins.rs | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/src/compiler_builtins.rs b/src/compiler_builtins.rs index 5b6d0ef6d..8886317c1 100644 --- a/src/compiler_builtins.rs +++ b/src/compiler_builtins.rs @@ -46,15 +46,49 @@ builtin_functions! 
{ fn __rust_u128_mulo(a: u128, b: u128, oflow: &mut i32) -> u128; fn __rust_i128_mulo(a: i128, b: i128, oflow: &mut i32) -> i128; - // floats + // integer -> float fn __floattisf(i: i128) -> f32; fn __floattidf(i: i128) -> f64; fn __floatuntisf(i: u128) -> f32; fn __floatuntidf(i: u128) -> f64; + // float -> integer fn __fixsfti(f: f32) -> i128; fn __fixdfti(f: f64) -> i128; fn __fixunssfti(f: f32) -> u128; fn __fixunsdfti(f: f64) -> u128; + // float binops + fn fmodf(a: f32, b: f32) -> f32; + fn fmod(a: f64, b: f64) -> f64; + // Cranelift float libcalls + fn fmaf(a: f32, b: f32, c: f32) -> f32; + fn fma(a: f64, b: f64, c: f64) -> f64; + fn floorf(f: f32) -> f32; + fn floor(f: f64) -> f64; + fn ceilf(f: f32) -> f32; + fn ceil(f: f64) -> f64; + fn truncf(f: f32) -> f32; + fn trunc(f: f64) -> f64; + fn nearbyintf(f: f32) -> f32; + fn nearbyint(f: f64) -> f64; + // float intrinsics + fn __powisf2(a: f32, b: i32) -> f32; + fn __powidf2(a: f64, b: i32) -> f64; + fn powf(a: f32, b: f32) -> f32; + fn pow(a: f64, b: f64) -> f64; + fn expf(f: f32) -> f32; + fn exp(f: f64) -> f64; + fn exp2f(f: f32) -> f32; + fn exp2(f: f64) -> f64; + fn logf(f: f32) -> f32; + fn log(f: f64) -> f64; + fn log2f(f: f32) -> f32; + fn log2(f: f64) -> f64; + fn log10f(f: f32) -> f32; + fn log10(f: f64) -> f64; + fn sinf(f: f32) -> f32; + fn sin(f: f64) -> f64; + fn cosf(f: f32) -> f32; + fn cos(f: f64) -> f64; // allocator // NOTE: These need to be mentioned here despite not being part of compiler_builtins because From 87c425b9e1ce015c4e3dad7729e8d54bc7b6b27b Mon Sep 17 00:00:00 2001 From: beetrees Date: Fri, 23 May 2025 15:51:51 +0100 Subject: [PATCH 3/8] Add basic support for `f16`/`f128` values --- src/abi/pass_mode.rs | 2 ++ src/common.rs | 8 ++++---- src/value_and_place.rs | 14 +++++++++++++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/abi/pass_mode.rs b/src/abi/pass_mode.rs index 06d89bc9e..6d8614aca 100644 --- a/src/abi/pass_mode.rs +++ b/src/abi/pass_mode.rs @@ -22,8 +22,10 @@ fn reg_to_abi_param(reg: Reg) -> AbiParam { (RegKind::Integer, 3..=4) => types::I32, (RegKind::Integer, 5..=8) => types::I64, (RegKind::Integer, 9..=16) => types::I128, + (RegKind::Float, 2) => types::F16, (RegKind::Float, 4) => types::F32, (RegKind::Float, 8) => types::F64, + (RegKind::Float, 16) => types::F128, (RegKind::Vector, size) => types::I8.by(u32::try_from(size).unwrap()).unwrap(), _ => unreachable!("{:?}", reg), }; diff --git a/src/common.rs b/src/common.rs index abe2972ba..2f11b2d2d 100644 --- a/src/common.rs +++ b/src/common.rs @@ -33,10 +33,10 @@ pub(crate) fn scalar_to_clif_type(tcx: TyCtxt<'_>, scalar: Scalar) -> Type { Integer::I128 => types::I128, }, Primitive::Float(float) => match float { - Float::F16 => unimplemented!("f16_f128"), + Float::F16 => types::F16, Float::F32 => types::F32, Float::F64 => types::F64, - Float::F128 => unimplemented!("f16_f128"), + Float::F128 => types::F128, }, // FIXME(erikdesjardins): handle non-default addrspace ptr sizes Primitive::Pointer(_) => pointer_ty(tcx), @@ -64,10 +64,10 @@ fn clif_type_from_ty<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> Option types::I32, ty::Float(size) => match size { - FloatTy::F16 => unimplemented!("f16_f128"), + FloatTy::F16 => types::F16, FloatTy::F32 => types::F32, FloatTy::F64 => types::F64, - FloatTy::F128 => unimplemented!("f16_f128"), + FloatTy::F128 => types::F128, }, ty::FnPtr(..) 
=> pointer_ty(tcx), ty::RawPtr(pointee_ty, _) | ty::Ref(_, pointee_ty, _) => { diff --git a/src/value_and_place.rs b/src/value_and_place.rs index 0938c9905..cbfb215a8 100644 --- a/src/value_and_place.rs +++ b/src/value_and_place.rs @@ -325,7 +325,7 @@ impl<'tcx> CValue<'tcx> { const_val: ty::ScalarInt, ) -> CValue<'tcx> { assert_eq!(const_val.size(), layout.size, "{:#?}: {:?}", const_val, layout); - use cranelift_codegen::ir::immediates::{Ieee32, Ieee64}; + use cranelift_codegen::ir::immediates::{Ieee16, Ieee32, Ieee64, Ieee128}; let clif_ty = fx.clif_type(layout.ty).unwrap(); @@ -346,12 +346,24 @@ impl<'tcx> CValue<'tcx> { let raw_val = const_val.size().truncate(const_val.to_bits(layout.size)); fx.bcx.ins().iconst(clif_ty, raw_val as i64) } + ty::Float(FloatTy::F16) => { + fx.bcx.ins().f16const(Ieee16::with_bits(u16::try_from(const_val).unwrap())) + } ty::Float(FloatTy::F32) => { fx.bcx.ins().f32const(Ieee32::with_bits(u32::try_from(const_val).unwrap())) } ty::Float(FloatTy::F64) => { fx.bcx.ins().f64const(Ieee64::with_bits(u64::try_from(const_val).unwrap())) } + ty::Float(FloatTy::F128) => { + let value = fx + .bcx + .func + .dfg + .constants + .insert(Ieee128::with_bits(u128::try_from(const_val).unwrap()).into()); + fx.bcx.ins().f128const(value) + } _ => panic!( "CValue::const_val for non bool/char/float/integer/pointer type {:?} is not allowed", layout.ty From 27a9590d3db284a12143dafd03bf7343b59b6808 Mon Sep 17 00:00:00 2001 From: beetrees Date: Fri, 23 May 2025 15:51:51 +0100 Subject: [PATCH 4/8] Add `f16`/`f128` `+`/`-`/`*`/`/`/`%` support --- build_system/build_backend.rs | 2 +- src/base.rs | 11 +++++- src/codegen_f16_f128.rs | 64 +++++++++++++++++++++++++++++++++++ src/compiler_builtins.rs | 8 +++++ src/lib.rs | 4 +++ src/num.rs | 53 ++++++++++++++++++++++++++--- 6 files changed, 135 insertions(+), 7 deletions(-) create mode 100644 src/codegen_f16_f128.rs diff --git a/build_system/build_backend.rs b/build_system/build_backend.rs index 72bc42252..bf7cf1c0a 100644 --- a/build_system/build_backend.rs +++ b/build_system/build_backend.rs @@ -18,7 +18,7 @@ pub(crate) fn build_backend( let mut cmd = CG_CLIF.build(&bootstrap_host_compiler, dirs); let mut rustflags = rustflags_from_env("RUSTFLAGS"); - rustflags.push("-Zallow-features=rustc_private".to_owned()); + rustflags.push("-Zallow-features=rustc_private,f16,f128".to_owned()); rustflags_to_cmd_env(&mut cmd, "RUSTFLAGS", &rustflags); if env::var("CG_CLIF_EXPENSIVE_CHECKS").is_ok() { diff --git a/src/base.rs b/src/base.rs index c1f4c065c..461730410 100644 --- a/src/base.rs +++ b/src/base.rs @@ -15,9 +15,9 @@ use rustc_middle::ty::print::with_no_trimmed_paths; use crate::constant::ConstantCx; use crate::debuginfo::{FunctionDebugContext, TypeDebugContext}; -use crate::enable_verifier; use crate::prelude::*; use crate::pretty_clif::CommentWriter; +use crate::{codegen_f16_f128, enable_verifier}; pub(crate) struct CodegenedFunction { symbol_name: String, @@ -635,6 +635,15 @@ fn codegen_stmt<'tcx>(fx: &mut FunctionCx<'_, '_, 'tcx>, cur_block: Block, stmt: let val = operand.load_scalar(fx); match layout.ty.kind() { ty::Int(_) => CValue::by_val(fx.bcx.ins().ineg(val), layout), + // FIXME(bytecodealliance/wasmtime#8312): Remove + // once backend lowerings have been added to + // Cranelift. 
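+                        // `neg_f16`/`neg_f128` negate by flipping the sign
+                        // bit through integer bitcasts instead of using
+                        // Cranelift's `fneg`.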
+ ty::Float(FloatTy::F16) => { + CValue::by_val(codegen_f16_f128::neg_f16(fx, val), layout) + } + ty::Float(FloatTy::F128) => { + CValue::by_val(codegen_f16_f128::neg_f128(fx, val), layout) + } ty::Float(_) => CValue::by_val(fx.bcx.ins().fneg(val), layout), _ => unreachable!("un op Neg for {:?}", layout.ty), } diff --git a/src/codegen_f16_f128.rs b/src/codegen_f16_f128.rs new file mode 100644 index 000000000..c6f0779b8 --- /dev/null +++ b/src/codegen_f16_f128.rs @@ -0,0 +1,64 @@ +use crate::prelude::*; + +pub(crate) fn f16_to_f32(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { + let (value, arg_ty) = + if fx.tcx.sess.target.vendor == "apple" && fx.tcx.sess.target.arch == "x86_64" { + ( + fx.bcx.ins().bitcast(types::I16, MemFlags::new(), value), + lib_call_arg_param(fx.tcx, types::I16, false), + ) + } else { + (value, AbiParam::new(types::F16)) + }; + fx.lib_call("__extendhfsf2", vec![arg_ty], vec![AbiParam::new(types::F32)], &[value])[0] +} + +pub(crate) fn f32_to_f16(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { + let ret_ty = if fx.tcx.sess.target.vendor == "apple" && fx.tcx.sess.target.arch == "x86_64" { + types::I16 + } else { + types::F16 + }; + let ret = fx.lib_call( + "__truncsfhf2", + vec![AbiParam::new(types::F32)], + vec![AbiParam::new(ret_ty)], + &[value], + )[0]; + if ret_ty == types::I16 { fx.bcx.ins().bitcast(types::F16, MemFlags::new(), ret) } else { ret } +} + +pub(crate) fn codegen_f128_binop( + fx: &mut FunctionCx<'_, '_, '_>, + bin_op: BinOp, + lhs: Value, + rhs: Value, +) -> Value { + let name = match bin_op { + BinOp::Add => "__addtf3", + BinOp::Sub => "__subtf3", + BinOp::Mul => "__multf3", + BinOp::Div => "__divtf3", + _ => unreachable!("handled in `codegen_float_binop`"), + }; + fx.lib_call( + name, + vec![AbiParam::new(types::F128), AbiParam::new(types::F128)], + vec![AbiParam::new(types::F128)], + &[lhs, rhs], + )[0] +} + +pub(crate) fn neg_f16(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { + let bits = fx.bcx.ins().bitcast(types::I16, MemFlags::new(), value); + let bits = fx.bcx.ins().bxor_imm(bits, 0x8000); + fx.bcx.ins().bitcast(types::F16, MemFlags::new(), bits) +} + +pub(crate) fn neg_f128(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { + let bits = fx.bcx.ins().bitcast(types::I128, MemFlags::new(), value); + let (low, high) = fx.bcx.ins().isplit(bits); + let high = fx.bcx.ins().bxor_imm(high, 0x8000_0000_0000_0000_u64 as i64); + let bits = fx.bcx.ins().iconcat(low, high); + fx.bcx.ins().bitcast(types::F128, MemFlags::new(), bits) +} diff --git a/src/compiler_builtins.rs b/src/compiler_builtins.rs index 8886317c1..0c75df845 100644 --- a/src/compiler_builtins.rs +++ b/src/compiler_builtins.rs @@ -56,9 +56,17 @@ builtin_functions! 
{
 fn __fixdfti(f: f64) -> i128;
 fn __fixunssfti(f: f32) -> u128;
 fn __fixunsdfti(f: f64) -> u128;
+    // float -> float
+    fn __extendhfsf2(f: f16) -> f32;
+    fn __truncsfhf2(f: f32) -> f16;
     // float binops
+    fn __addtf3(a: f128, b: f128) -> f128;
+    fn __subtf3(a: f128, b: f128) -> f128;
+    fn __multf3(a: f128, b: f128) -> f128;
+    fn __divtf3(a: f128, b: f128) -> f128;
     fn fmodf(a: f32, b: f32) -> f32;
     fn fmod(a: f64, b: f64) -> f64;
+    fn fmodf128(a: f128, b: f128) -> f128;
     // Cranelift float libcalls
     fn fmaf(a: f32, b: f32, c: f32) -> f32;
     fn fma(a: f64, b: f64, c: f64) -> f64;
diff --git a/src/lib.rs b/src/lib.rs
index d41a80819..03dfd4956 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -6,6 +6,9 @@
 #![cfg_attr(doc, feature(rustdoc_internals))]
 // Note: please avoid adding other feature gates where possible
 #![feature(rustc_private)]
+// Only used to define intrinsics in `compiler_builtins.rs`.
+#![feature(f16)]
+#![feature(f128)]
 // Note: please avoid adding other feature gates where possible
 #![warn(rust_2018_idioms)]
 #![warn(unreachable_pub)]
@@ -57,6 +60,7 @@ mod allocator;
 mod analyze;
 mod base;
 mod cast;
+mod codegen_f16_f128;
 mod codegen_i128;
 mod common;
 mod compiler_builtins;
diff --git a/src/num.rs b/src/num.rs
index 90627f806..3ed276267 100644
--- a/src/num.rs
+++ b/src/num.rs
@@ -1,5 +1,6 @@
 //! Various operations on integer and floating-point numbers
 
+use crate::codegen_f16_f128;
 use crate::prelude::*;
 
 fn bin_op_to_intcc(bin_op: BinOp, signed: bool) -> IntCC {
@@ -350,25 +351,60 @@ pub(crate) fn codegen_float_binop<'tcx>(
     let lhs = in_lhs.load_scalar(fx);
     let rhs = in_rhs.load_scalar(fx);
 
+    // FIXME(bytecodealliance/wasmtime#8312): Remove once backend lowerings have
+    // been added to Cranelift.
+    let (lhs, rhs) = if *in_lhs.layout().ty.kind() == ty::Float(FloatTy::F16) {
+        (codegen_f16_f128::f16_to_f32(fx, lhs), codegen_f16_f128::f16_to_f32(fx, rhs))
+    } else {
+        (lhs, rhs)
+    };
     let b = fx.bcx.ins();
     let res = match bin_op {
+        // FIXME(bytecodealliance/wasmtime#8312): Remove once backend lowerings
+        // have been added to Cranelift.
+        BinOp::Add | BinOp::Sub | BinOp::Mul | BinOp::Div
+            if *in_lhs.layout().ty.kind() == ty::Float(FloatTy::F128) =>
+        {
+            codegen_f16_f128::codegen_f128_binop(fx, bin_op, lhs, rhs)
+        }
         BinOp::Add => b.fadd(lhs, rhs),
         BinOp::Sub => b.fsub(lhs, rhs),
         BinOp::Mul => b.fmul(lhs, rhs),
        BinOp::Div => b.fdiv(lhs, rhs),
         BinOp::Rem => {
-            let (name, ty) = match in_lhs.layout().ty.kind() {
-                ty::Float(FloatTy::F32) => ("fmodf", types::F32),
-                ty::Float(FloatTy::F64) => ("fmod", types::F64),
+            let (name, ty, lhs, rhs) = match in_lhs.layout().ty.kind() {
+                ty::Float(FloatTy::F16) => (
+                    "fmodf",
+                    types::F32,
+                    // FIXME(bytecodealliance/wasmtime#8312): `lhs` and `rhs`
+                    // were already promoted to `f32` by the conversion above,
+                    // so the promotions below are not needed yet:
+                    // fx.bcx.ins().fpromote(types::F32, lhs),
+                    // fx.bcx.ins().fpromote(types::F32, rhs),
+                    lhs,
+                    rhs,
+                ),
+                ty::Float(FloatTy::F32) => ("fmodf", types::F32, lhs, rhs),
+                ty::Float(FloatTy::F64) => ("fmod", types::F64, lhs, rhs),
+                ty::Float(FloatTy::F128) => ("fmodf128", types::F128, lhs, rhs),
                 _ => bug!(),
             };
-            fx.lib_call(
+            let ret_val = fx.lib_call(
                 name,
                 vec![AbiParam::new(ty), AbiParam::new(ty)],
                 vec![AbiParam::new(ty)],
                 &[lhs, rhs],
-            )[0]
+            )[0];
+
+            let ret_val = if *in_lhs.layout().ty.kind() == ty::Float(FloatTy::F16) {
+                // FIXME(bytecodealliance/wasmtime#8312): Use the native
+                // Cranelift operation once Cranelift backend lowerings have
+                // been implemented.
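+                // `fmodf` computed the remainder in `f32`, so demote the
+                // result back to `f16` here.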
+ codegen_f16_f128::f32_to_f16(fx, ret_val) + } else { + ret_val + }; + return CValue::by_val(ret_val, in_lhs.layout()); } BinOp::Eq | BinOp::Lt | BinOp::Le | BinOp::Ne | BinOp::Ge | BinOp::Gt => { let fltcc = match bin_op { @@ -386,6 +422,13 @@ pub(crate) fn codegen_float_binop<'tcx>( _ => unreachable!("{:?}({:?}, {:?})", bin_op, in_lhs, in_rhs), }; + // FIXME(bytecodealliance/wasmtime#8312): Remove once backend lowerings have + // been added to Cranelift. + let res = if *in_lhs.layout().ty.kind() == ty::Float(FloatTy::F16) { + codegen_f16_f128::f32_to_f16(fx, res) + } else { + res + }; CValue::by_val(res, in_lhs.layout()) } From 38d48dbe083b680b54e39c2ead2a92d76001786c Mon Sep 17 00:00:00 2001 From: beetrees Date: Fri, 23 May 2025 15:51:51 +0100 Subject: [PATCH 5/8] Add `f16`/`f128` comparison support --- src/codegen_f16_f128.rs | 54 +++++++++++++++++++++++++ src/compiler_builtins.rs | 9 +++++ src/intrinsics/mod.rs | 87 ++++++++++++++++++++++++++++++++++++++++ src/num.rs | 17 +++++--- 4 files changed, 162 insertions(+), 5 deletions(-) diff --git a/src/codegen_f16_f128.rs b/src/codegen_f16_f128.rs index c6f0779b8..c570fbbd9 100644 --- a/src/codegen_f16_f128.rs +++ b/src/codegen_f16_f128.rs @@ -28,6 +28,42 @@ pub(crate) fn f32_to_f16(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value if ret_ty == types::I16 { fx.bcx.ins().bitcast(types::F16, MemFlags::new(), ret) } else { ret } } +pub(crate) fn fcmp(fx: &mut FunctionCx<'_, '_, '_>, cc: FloatCC, lhs: Value, rhs: Value) -> Value { + let ty = fx.bcx.func.dfg.value_type(lhs); + match ty { + types::F32 | types::F64 => fx.bcx.ins().fcmp(cc, lhs, rhs), + types::F16 => { + let lhs = f16_to_f32(fx, lhs); + let rhs = f16_to_f32(fx, rhs); + fx.bcx.ins().fcmp(cc, lhs, rhs) + } + types::F128 => { + let (name, int_cc) = match cc { + FloatCC::Equal => ("__eqtf2", IntCC::Equal), + FloatCC::NotEqual => ("__netf2", IntCC::NotEqual), + FloatCC::LessThan => ("__lttf2", IntCC::SignedLessThan), + FloatCC::LessThanOrEqual => ("__letf2", IntCC::SignedLessThanOrEqual), + FloatCC::GreaterThan => ("__gttf2", IntCC::SignedGreaterThan), + FloatCC::GreaterThanOrEqual => ("__getf2", IntCC::SignedGreaterThanOrEqual), + _ => unreachable!("not currently used in rustc_codegen_cranelift: {cc:?}"), + }; + let res = fx.lib_call( + name, + vec![AbiParam::new(types::F128), AbiParam::new(types::F128)], + // FIXME(rust-lang/compiler-builtins#919): This should be `I64` on non-AArch64 + // architectures, but switching it before compiler-builtins is fixed causes test + // failures. 
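+            // The `__*tf2` comparison libcalls return a negative, zero or
+            // positive integer; comparing it against zero below recovers the
+            // boolean result.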
+ vec![AbiParam::new(types::I32)], + &[lhs, rhs], + )[0]; + let zero = fx.bcx.ins().iconst(types::I32, 0); + let res = fx.bcx.ins().icmp(int_cc, res, zero); + res + } + _ => unreachable!("{ty:?}"), + } +} + pub(crate) fn codegen_f128_binop( fx: &mut FunctionCx<'_, '_, '_>, bin_op: BinOp, @@ -62,3 +98,21 @@ pub(crate) fn neg_f128(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { let bits = fx.bcx.ins().iconcat(low, high); fx.bcx.ins().bitcast(types::F128, MemFlags::new(), bits) } + +pub(crate) fn fmin_f128(fx: &mut FunctionCx<'_, '_, '_>, a: Value, b: Value) -> Value { + fx.lib_call( + "fminimumf128", + vec![AbiParam::new(types::F128), AbiParam::new(types::F128)], + vec![AbiParam::new(types::F128)], + &[a, b], + )[0] +} + +pub(crate) fn fmax_f128(fx: &mut FunctionCx<'_, '_, '_>, a: Value, b: Value) -> Value { + fx.lib_call( + "fmaximumf128", + vec![AbiParam::new(types::F128), AbiParam::new(types::F128)], + vec![AbiParam::new(types::F128)], + &[a, b], + )[0] +} diff --git a/src/compiler_builtins.rs b/src/compiler_builtins.rs index 0c75df845..017a1370a 100644 --- a/src/compiler_builtins.rs +++ b/src/compiler_builtins.rs @@ -67,6 +67,15 @@ builtin_functions! { fn fmodf(a: f32, b: f32) -> f32; fn fmod(a: f64, b: f64) -> f64; fn fmodf128(a: f128, b: f128) -> f128; + // float comparison + fn __eqtf2(a: f128, b: f128) -> i32; + fn __netf2(a: f128, b: f128) -> i32; + fn __lttf2(a: f128, b: f128) -> i32; + fn __letf2(a: f128, b: f128) -> i32; + fn __gttf2(a: f128, b: f128) -> i32; + fn __getf2(a: f128, b: f128) -> i32; + fn fminimumf128(a: f128, b: f128) -> f128; + fn fmaximumf128(a: f128, b: f128) -> f128; // Cranelift float libcalls fn fmaf(a: f32, b: f32, c: f32) -> f32; fn fma(a: f64, b: f64, c: f64) -> f64; diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs index 7c18922d9..6481d2112 100644 --- a/src/intrinsics/mod.rs +++ b/src/intrinsics/mod.rs @@ -27,6 +27,7 @@ use rustc_span::{Symbol, sym}; pub(crate) use self::llvm::codegen_llvm_intrinsic_call; use crate::cast::clif_intcast; +use crate::codegen_f16_f128; use crate::prelude::*; fn bug_on_incorrect_arg_count(intrinsic: impl std::fmt::Display) -> ! { @@ -1118,6 +1119,20 @@ fn codegen_regular_intrinsic_call<'tcx>( ret.write_cvalue(fx, old); } + sym::minimumf16 => { + intrinsic_args!(fx, args => (a, b); intrinsic); + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + + // FIXME(bytecodealliance/wasmtime#8312): Use `fmin` directly once + // Cranelift backend lowerings are implemented. + let a = codegen_f16_f128::f16_to_f32(fx, a); + let b = codegen_f16_f128::f16_to_f32(fx, b); + let val = fx.bcx.ins().fmin(a, b); + let val = codegen_f16_f128::f32_to_f16(fx, val); + let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f16)); + ret.write_cvalue(fx, val); + } sym::minimumf32 => { intrinsic_args!(fx, args => (a, b); intrinsic); let a = a.load_scalar(fx); @@ -1136,6 +1151,31 @@ fn codegen_regular_intrinsic_call<'tcx>( let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f64)); ret.write_cvalue(fx, val); } + sym::minimumf128 => { + intrinsic_args!(fx, args => (a, b); intrinsic); + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + + // FIXME(bytecodealliance/wasmtime#8312): Use `fmin` once Cranelift + // backend lowerings are implemented. 
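+            // `fminimumf128` comes from compiler-builtins and implements the
+            // NaN-propagating IEEE 754-2019 `minimum` operation.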
+ let val = codegen_f16_f128::fmin_f128(fx, a, b); + let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f128)); + ret.write_cvalue(fx, val); + } + sym::maximumf16 => { + intrinsic_args!(fx, args => (a, b); intrinsic); + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + + // FIXME(bytecodealliance/wasmtime#8312): Use `fmax` directly once + // Cranelift backend lowerings are implemented. + let a = codegen_f16_f128::f16_to_f32(fx, a); + let b = codegen_f16_f128::f16_to_f32(fx, b); + let val = fx.bcx.ins().fmax(a, b); + let val = codegen_f16_f128::f32_to_f16(fx, val); + let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f16)); + ret.write_cvalue(fx, val); + } sym::maximumf32 => { intrinsic_args!(fx, args => (a, b); intrinsic); let a = a.load_scalar(fx); @@ -1154,7 +1194,27 @@ fn codegen_regular_intrinsic_call<'tcx>( let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f64)); ret.write_cvalue(fx, val); } + sym::maximumf128 => { + intrinsic_args!(fx, args => (a, b); intrinsic); + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + + // FIXME(bytecodealliance/wasmtime#8312): Use `fmax` once Cranelift + // backend lowerings are implemented. + let val = codegen_f16_f128::fmax_f128(fx, a, b); + let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f128)); + ret.write_cvalue(fx, val); + } + + sym::minnumf16 => { + intrinsic_args!(fx, args => (a, b); intrinsic); + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + let val = crate::num::codegen_float_min(fx, a, b); + let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f16)); + ret.write_cvalue(fx, val); + } sym::minnumf32 => { intrinsic_args!(fx, args => (a, b); intrinsic); let a = a.load_scalar(fx); @@ -1173,6 +1233,24 @@ fn codegen_regular_intrinsic_call<'tcx>( let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f64)); ret.write_cvalue(fx, val); } + sym::minnumf128 => { + intrinsic_args!(fx, args => (a, b); intrinsic); + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + + let val = crate::num::codegen_float_min(fx, a, b); + let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f128)); + ret.write_cvalue(fx, val); + } + sym::maxnumf16 => { + intrinsic_args!(fx, args => (a, b); intrinsic); + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + + let val = crate::num::codegen_float_max(fx, a, b); + let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f16)); + ret.write_cvalue(fx, val); + } sym::maxnumf32 => { intrinsic_args!(fx, args => (a, b); intrinsic); let a = a.load_scalar(fx); @@ -1191,6 +1269,15 @@ fn codegen_regular_intrinsic_call<'tcx>( let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f64)); ret.write_cvalue(fx, val); } + sym::maxnumf128 => { + intrinsic_args!(fx, args => (a, b); intrinsic); + let a = a.load_scalar(fx); + let b = b.load_scalar(fx); + + let val = crate::num::codegen_float_max(fx, a, b); + let val = CValue::by_val(val, fx.layout_of(fx.tcx.types.f128)); + ret.write_cvalue(fx, val); + } sym::catch_unwind => { intrinsic_args!(fx, args => (f, data, catch_fn); intrinsic); diff --git a/src/num.rs b/src/num.rs index 3ed276267..f53045df6 100644 --- a/src/num.rs +++ b/src/num.rs @@ -416,7 +416,10 @@ pub(crate) fn codegen_float_binop<'tcx>( BinOp::Gt => FloatCC::GreaterThan, _ => unreachable!(), }; - let val = fx.bcx.ins().fcmp(fltcc, lhs, rhs); + // FIXME(bytecodealliance/wasmtime#8312): Replace with Cranelift + // `fcmp` once `f16`/`f128` backend lowerings have been added to + // Cranelift. 
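+            // `codegen_f16_f128::fcmp` emits a native `fcmp` for `f32`/`f64`
+            // (any `f16` operands were already promoted above) and lowers
+            // `f128` comparisons to libcalls.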
+ let val = codegen_f16_f128::fcmp(fx, fltcc, lhs, rhs); return CValue::by_val(val, fx.layout_of(fx.tcx.types.bool)); } _ => unreachable!("{:?}({:?}, {:?})", bin_op, in_lhs, in_rhs), @@ -500,15 +503,19 @@ fn codegen_ptr_binop<'tcx>( // and `a.is_nan() ? b : (a <= b ? b : a)` for `maxnumf*`. NaN checks are done by comparing // a float against itself. Only in case of NaN is it not equal to itself. pub(crate) fn codegen_float_min(fx: &mut FunctionCx<'_, '_, '_>, a: Value, b: Value) -> Value { - let a_is_nan = fx.bcx.ins().fcmp(FloatCC::NotEqual, a, a); - let a_ge_b = fx.bcx.ins().fcmp(FloatCC::GreaterThanOrEqual, a, b); + // FIXME(bytecodealliance/wasmtime#8312): Replace with Cranelift `fcmp` once + // `f16`/`f128` backend lowerings have been added to Cranelift. + let a_is_nan = codegen_f16_f128::fcmp(fx, FloatCC::NotEqual, a, a); + let a_ge_b = codegen_f16_f128::fcmp(fx, FloatCC::GreaterThanOrEqual, a, b); let temp = fx.bcx.ins().select(a_ge_b, b, a); fx.bcx.ins().select(a_is_nan, b, temp) } pub(crate) fn codegen_float_max(fx: &mut FunctionCx<'_, '_, '_>, a: Value, b: Value) -> Value { - let a_is_nan = fx.bcx.ins().fcmp(FloatCC::NotEqual, a, a); - let a_le_b = fx.bcx.ins().fcmp(FloatCC::LessThanOrEqual, a, b); + // FIXME(bytecodealliance/wasmtime#8312): Replace with Cranelift `fcmp` once + // `f16`/`f128` backend lowerings have been added to Cranelift. + let a_is_nan = codegen_f16_f128::fcmp(fx, FloatCC::NotEqual, a, a); + let a_le_b = codegen_f16_f128::fcmp(fx, FloatCC::LessThanOrEqual, a, b); let temp = fx.bcx.ins().select(a_le_b, b, a); fx.bcx.ins().select(a_is_nan, b, temp) } From e46186f9944cf10c35b228e38ba6312903587896 Mon Sep 17 00:00:00 2001 From: beetrees Date: Fri, 23 May 2025 15:51:51 +0100 Subject: [PATCH 6/8] Add support for casting to and from `f16`/`f128` --- src/cast.rs | 21 ++++++- src/codegen_f16_f128.rs | 123 +++++++++++++++++++++++++++++++++++++++ src/compiler_builtins.rs | 19 ++++++ 3 files changed, 161 insertions(+), 2 deletions(-) diff --git a/src/cast.rs b/src/cast.rs index e23463242..8a725680e 100644 --- a/src/cast.rs +++ b/src/cast.rs @@ -1,5 +1,6 @@ //! Various number casting functions +use crate::codegen_f16_f128; use crate::prelude::*; pub(crate) fn clif_intcast( @@ -36,6 +37,14 @@ pub(crate) fn clif_int_or_float_cast( ) -> Value { let from_ty = fx.bcx.func.dfg.value_type(from); + // FIXME(bytecodealliance/wasmtime#8312): Remove in favour of native + // Cranelift operations once Cranelift backends have lowerings for them. 
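+    // Route every cast involving an `f16` or `f128` value through the
+    // libcall-based helper instead of native Cranelift instructions.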
+ if matches!(from_ty, types::F16 | types::F128) + || matches!(to_ty, types::F16 | types::F128) && from_ty != to_ty + { + return codegen_f16_f128::codegen_cast(fx, from, from_signed, to_ty, to_signed); + } + if from_ty.is_int() && to_ty.is_int() { // int-like -> int-like clif_intcast( @@ -58,8 +67,10 @@ pub(crate) fn clif_int_or_float_cast( "__float{sign}ti{flt}f", sign = if from_signed { "" } else { "un" }, flt = match to_ty { + types::F16 => "h", types::F32 => "s", types::F64 => "d", + types::F128 => "t", _ => unreachable!("{:?}", to_ty), }, ); @@ -90,8 +101,10 @@ pub(crate) fn clif_int_or_float_cast( "__fix{sign}{flt}fti", sign = if to_signed { "" } else { "uns" }, flt = match from_ty { + types::F16 => "h", types::F32 => "s", types::F64 => "d", + types::F128 => "t", _ => unreachable!("{:?}", to_ty), }, ); @@ -145,8 +158,12 @@ pub(crate) fn clif_int_or_float_cast( } else if from_ty.is_float() && to_ty.is_float() { // float -> float match (from_ty, to_ty) { - (types::F32, types::F64) => fx.bcx.ins().fpromote(types::F64, from), - (types::F64, types::F32) => fx.bcx.ins().fdemote(types::F32, from), + (types::F16, types::F32 | types::F64 | types::F128) + | (types::F32, types::F64 | types::F128) + | (types::F64, types::F128) => fx.bcx.ins().fpromote(to_ty, from), + (types::F128, types::F64 | types::F32 | types::F16) + | (types::F64, types::F32 | types::F16) + | (types::F32, types::F16) => fx.bcx.ins().fdemote(to_ty, from), _ => from, } } else { diff --git a/src/codegen_f16_f128.rs b/src/codegen_f16_f128.rs index c570fbbd9..a887341ce 100644 --- a/src/codegen_f16_f128.rs +++ b/src/codegen_f16_f128.rs @@ -13,6 +13,11 @@ pub(crate) fn f16_to_f32(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value fx.lib_call("__extendhfsf2", vec![arg_ty], vec![AbiParam::new(types::F32)], &[value])[0] } +fn f16_to_f64(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { + let ret = f16_to_f32(fx, value); + fx.bcx.ins().fpromote(types::F64, ret) +} + pub(crate) fn f32_to_f16(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { let ret_ty = if fx.tcx.sess.target.vendor == "apple" && fx.tcx.sess.target.arch == "x86_64" { types::I16 @@ -28,6 +33,21 @@ pub(crate) fn f32_to_f16(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value if ret_ty == types::I16 { fx.bcx.ins().bitcast(types::F16, MemFlags::new(), ret) } else { ret } } +fn f64_to_f16(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { + let ret_ty = if fx.tcx.sess.target.vendor == "apple" && fx.tcx.sess.target.arch == "x86_64" { + types::I16 + } else { + types::F16 + }; + let ret = fx.lib_call( + "__truncdfhf2", + vec![AbiParam::new(types::F64)], + vec![AbiParam::new(ret_ty)], + &[value], + )[0]; + if ret_ty == types::I16 { fx.bcx.ins().bitcast(types::F16, MemFlags::new(), ret) } else { ret } +} + pub(crate) fn fcmp(fx: &mut FunctionCx<'_, '_, '_>, cc: FloatCC, lhs: Value, rhs: Value) -> Value { let ty = fx.bcx.func.dfg.value_type(lhs); match ty { @@ -99,6 +119,109 @@ pub(crate) fn neg_f128(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { fx.bcx.ins().bitcast(types::F128, MemFlags::new(), bits) } +pub(crate) fn codegen_cast( + fx: &mut FunctionCx<'_, '_, '_>, + from: Value, + from_signed: bool, + to_ty: Type, + to_signed: bool, +) -> Value { + let from_ty = fx.bcx.func.dfg.value_type(from); + if from_ty.is_float() && to_ty.is_float() { + let name = match (from_ty, to_ty) { + (types::F16, types::F32) => return f16_to_f32(fx, from), + (types::F16, types::F64) => return f16_to_f64(fx, from), + (types::F16, types::F128) => 
"__extendhftf2", + (types::F32, types::F128) => "__extendsftf2", + (types::F64, types::F128) => "__extenddftf2", + (types::F128, types::F64) => "__trunctfdf2", + (types::F128, types::F32) => "__trunctfsf2", + (types::F128, types::F16) => "__trunctfhf2", + (types::F64, types::F16) => return f64_to_f16(fx, from), + (types::F32, types::F16) => return f32_to_f16(fx, from), + _ => unreachable!("{from_ty:?} -> {to_ty:?}"), + }; + fx.lib_call(name, vec![AbiParam::new(from_ty)], vec![AbiParam::new(to_ty)], &[from])[0] + } else if from_ty.is_int() && to_ty == types::F16 { + let res = clif_int_or_float_cast(fx, from, from_signed, types::F32, false); + f32_to_f16(fx, res) + } else if from_ty == types::F16 && to_ty.is_int() { + let from = f16_to_f32(fx, from); + clif_int_or_float_cast(fx, from, false, to_ty, to_signed) + } else if from_ty.is_int() && to_ty == types::F128 { + let (from, from_ty) = if from_ty.bits() < 32 { + (clif_int_or_float_cast(fx, from, from_signed, types::I32, from_signed), types::I32) + } else { + (from, from_ty) + }; + let name = format!( + "__float{sign}{size}itf", + sign = if from_signed { "" } else { "un" }, + size = match from_ty { + types::I32 => 's', + types::I64 => 'd', + types::I128 => 't', + _ => unreachable!("{from_ty:?}"), + }, + ); + fx.lib_call( + &name, + vec![lib_call_arg_param(fx.tcx, from_ty, from_signed)], + vec![AbiParam::new(to_ty)], + &[from], + )[0] + } else if from_ty == types::F128 && to_ty.is_int() { + let ret_ty = if to_ty.bits() < 32 { types::I32 } else { to_ty }; + let name = format!( + "__fix{sign}tf{size}i", + sign = if from_signed { "" } else { "un" }, + size = match ret_ty { + types::I32 => 's', + types::I64 => 'd', + types::I128 => 't', + _ => unreachable!("{from_ty:?}"), + }, + ); + let ret = + fx.lib_call(&name, vec![AbiParam::new(from_ty)], vec![AbiParam::new(to_ty)], &[from]) + [0]; + let val = if ret_ty == to_ty { + ret + } else { + let (min, max) = match (to_ty, to_signed) { + (types::I8, false) => (0, i64::from(u8::MAX)), + (types::I16, false) => (0, i64::from(u16::MAX)), + (types::I8, true) => (i64::from(i8::MIN as u32), i64::from(i8::MAX as u32)), + (types::I16, true) => (i64::from(i16::MIN as u32), i64::from(i16::MAX as u32)), + _ => unreachable!("{to_ty:?}"), + }; + let min_val = fx.bcx.ins().iconst(types::I32, min); + let max_val = fx.bcx.ins().iconst(types::I32, max); + + let val = if to_signed { + let has_underflow = fx.bcx.ins().icmp_imm(IntCC::SignedLessThan, ret, min); + let has_overflow = fx.bcx.ins().icmp_imm(IntCC::SignedGreaterThan, ret, max); + let bottom_capped = fx.bcx.ins().select(has_underflow, min_val, ret); + fx.bcx.ins().select(has_overflow, max_val, bottom_capped) + } else { + let has_overflow = fx.bcx.ins().icmp_imm(IntCC::UnsignedGreaterThan, ret, max); + fx.bcx.ins().select(has_overflow, max_val, ret) + }; + fx.bcx.ins().ireduce(to_ty, val) + }; + + if let Some(false) = fx.tcx.sess.opts.unstable_opts.saturating_float_casts { + return val; + } + + let is_not_nan = fcmp(fx, FloatCC::Equal, from, from); + let zero = type_zero_value(&mut fx.bcx, to_ty); + fx.bcx.ins().select(is_not_nan, val, zero) + } else { + unreachable!("{from_ty:?} -> {to_ty:?}"); + } +} + pub(crate) fn fmin_f128(fx: &mut FunctionCx<'_, '_, '_>, a: Value, b: Value) -> Value { fx.lib_call( "fminimumf128", diff --git a/src/compiler_builtins.rs b/src/compiler_builtins.rs index 017a1370a..d9cbffd7a 100644 --- a/src/compiler_builtins.rs +++ b/src/compiler_builtins.rs @@ -49,15 +49,34 @@ builtin_functions! 
{ // integer -> float fn __floattisf(i: i128) -> f32; fn __floattidf(i: i128) -> f64; + fn __floatsitf(i: i32) -> f128; + fn __floatditf(i: i64) -> f128; + fn __floattitf(i: i128) -> f128; fn __floatuntisf(i: u128) -> f32; fn __floatuntidf(i: u128) -> f64; + fn __floatunsitf(i: u32) -> f128; + fn __floatunditf(i: u64) -> f128; + fn __floatuntitf(i: u128) -> f128; // float -> integer fn __fixsfti(f: f32) -> i128; fn __fixdfti(f: f64) -> i128; + fn __fixtfsi(f: f128) -> i32; + fn __fixtfdi(f: f128) -> i64; + fn __fixtfti(f: f128) -> i128; fn __fixunssfti(f: f32) -> u128; fn __fixunsdfti(f: f64) -> u128; + fn __fixunstfsi(f: f128) -> u32; + fn __fixunstfdi(f: f128) -> u64; + fn __fixunstfti(f: f128) -> u128; // float -> float fn __extendhfsf2(f: f16) -> f32; + fn __extendhftf2(f: f16) -> f128; + fn __extendsftf2(f: f32) -> f128; + fn __extenddftf2(f: f64) -> f128; + fn __trunctfdf2(f: f128) -> f64; + fn __trunctfsf2(f: f128) -> f32; + fn __trunctfhf2(f: f128) -> f16; + fn __truncdfhf2(f: f64) -> f16; fn __truncsfhf2(f: f32) -> f16; // float binops fn __addtf3(a: f128, b: f128) -> f128; From 2055f01323fbeada8857f9a0725c12567891c377 Mon Sep 17 00:00:00 2001 From: beetrees Date: Fri, 23 May 2025 15:51:51 +0100 Subject: [PATCH 7/8] Add `f16`/`f128` intrinsic support --- src/codegen_f16_f128.rs | 43 ++++++++++++++++++++++ src/compiler_builtins.rs | 15 ++++++++ src/intrinsics/mod.rs | 79 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 136 insertions(+), 1 deletion(-) diff --git a/src/codegen_f16_f128.rs b/src/codegen_f16_f128.rs index a887341ce..1e202be1f 100644 --- a/src/codegen_f16_f128.rs +++ b/src/codegen_f16_f128.rs @@ -119,6 +119,41 @@ pub(crate) fn neg_f128(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { fx.bcx.ins().bitcast(types::F128, MemFlags::new(), bits) } +pub(crate) fn abs_f16(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { + let bits = fx.bcx.ins().bitcast(types::I16, MemFlags::new(), value); + let bits = fx.bcx.ins().band_imm(bits, 0x7fff); + fx.bcx.ins().bitcast(types::F16, MemFlags::new(), bits) +} + +pub(crate) fn abs_f128(fx: &mut FunctionCx<'_, '_, '_>, value: Value) -> Value { + let bits = fx.bcx.ins().bitcast(types::I128, MemFlags::new(), value); + let (low, high) = fx.bcx.ins().isplit(bits); + let high = fx.bcx.ins().band_imm(high, 0x7fff_ffff_ffff_ffff_u64 as i64); + let bits = fx.bcx.ins().iconcat(low, high); + fx.bcx.ins().bitcast(types::F128, MemFlags::new(), bits) +} + +pub(crate) fn copysign_f16(fx: &mut FunctionCx<'_, '_, '_>, lhs: Value, rhs: Value) -> Value { + let lhs = fx.bcx.ins().bitcast(types::I16, MemFlags::new(), lhs); + let rhs = fx.bcx.ins().bitcast(types::I16, MemFlags::new(), rhs); + let res = fx.bcx.ins().band_imm(lhs, 0x7fff); + let sign = fx.bcx.ins().band_imm(rhs, 0x8000); + let res = fx.bcx.ins().bor(res, sign); + fx.bcx.ins().bitcast(types::F16, MemFlags::new(), res) +} + +pub(crate) fn copysign_f128(fx: &mut FunctionCx<'_, '_, '_>, lhs: Value, rhs: Value) -> Value { + let lhs = fx.bcx.ins().bitcast(types::I128, MemFlags::new(), lhs); + let rhs = fx.bcx.ins().bitcast(types::I128, MemFlags::new(), rhs); + let (low, lhs_high) = fx.bcx.ins().isplit(lhs); + let (_, rhs_high) = fx.bcx.ins().isplit(rhs); + let high = fx.bcx.ins().band_imm(lhs_high, 0x7fff_ffff_ffff_ffff_u64 as i64); + let sign = fx.bcx.ins().band_imm(rhs_high, 0x8000_0000_0000_0000_u64 as i64); + let high = fx.bcx.ins().bor(high, sign); + let res = fx.bcx.ins().iconcat(low, high); + fx.bcx.ins().bitcast(types::F128, MemFlags::new(), res) +} + pub(crate) fn 
codegen_cast( fx: &mut FunctionCx<'_, '_, '_>, from: Value, @@ -222,6 +257,14 @@ pub(crate) fn codegen_cast( } } +pub(crate) fn fma_f16(fx: &mut FunctionCx<'_, '_, '_>, x: Value, y: Value, z: Value) -> Value { + let x = f16_to_f64(fx, x); + let y = f16_to_f64(fx, y); + let z = f16_to_f64(fx, z); + let res = fx.bcx.ins().fma(x, y, z); + f64_to_f16(fx, res) +} + pub(crate) fn fmin_f128(fx: &mut FunctionCx<'_, '_, '_>, a: Value, b: Value) -> Value { fx.lib_call( "fminimumf128", diff --git a/src/compiler_builtins.rs b/src/compiler_builtins.rs index d9cbffd7a..6eea19211 100644 --- a/src/compiler_builtins.rs +++ b/src/compiler_builtins.rs @@ -109,6 +109,8 @@ builtin_functions! { // float intrinsics fn __powisf2(a: f32, b: i32) -> f32; fn __powidf2(a: f64, b: i32) -> f64; + // FIXME(f16_f128): `compiler-builtins` doesn't currently support `__powitf2` on MSVC. + // fn __powitf2(a: f128, b: i32) -> f128; fn powf(a: f32, b: f32) -> f32; fn pow(a: f64, b: f64) -> f64; fn expf(f: f32) -> f32; @@ -125,6 +127,19 @@ builtin_functions! { fn sin(f: f64) -> f64; fn cosf(f: f32) -> f32; fn cos(f: f64) -> f64; + fn fmaf128(a: f128, b: f128, c: f128) -> f128; + fn floorf16(f: f16) -> f16; + fn floorf128(f: f128) -> f128; + fn ceilf16(f: f16) -> f16; + fn ceilf128(f: f128) -> f128; + fn truncf16(f: f16) -> f16; + fn truncf128(f: f128) -> f128; + fn rintf16(f: f16) -> f16; + fn rintf128(f: f128) -> f128; + fn sqrtf16(f: f16) -> f16; + fn sqrtf128(f: f128) -> f128; + // FIXME(f16_f128): Add other float intrinsics as compiler-builtins gains support (meaning they + // are available on all targets). // allocator // NOTE: These need to be mentioned here despite not being part of compiler_builtins because diff --git a/src/intrinsics/mod.rs b/src/intrinsics/mod.rs index 6481d2112..b21ca32c9 100644 --- a/src/intrinsics/mod.rs +++ b/src/intrinsics/mod.rs @@ -249,8 +249,10 @@ fn bool_to_zero_or_max_uint<'tcx>( let ty = fx.clif_type(ty).unwrap(); let int_ty = match ty { + types::F16 => types::I16, types::F32 => types::I32, types::F64 => types::I64, + types::F128 => types::I128, ty => ty, }; @@ -309,45 +311,83 @@ fn codegen_float_intrinsic_call<'tcx>( ret: CPlace<'tcx>, ) -> bool { let (name, arg_count, ty, clif_ty) = match intrinsic { + sym::expf16 => ("expf16", 1, fx.tcx.types.f16, types::F16), sym::expf32 => ("expf", 1, fx.tcx.types.f32, types::F32), sym::expf64 => ("exp", 1, fx.tcx.types.f64, types::F64), + sym::expf128 => ("expf128", 1, fx.tcx.types.f128, types::F128), + sym::exp2f16 => ("exp2f16", 1, fx.tcx.types.f16, types::F16), sym::exp2f32 => ("exp2f", 1, fx.tcx.types.f32, types::F32), sym::exp2f64 => ("exp2", 1, fx.tcx.types.f64, types::F64), + sym::exp2f128 => ("exp2f128", 1, fx.tcx.types.f128, types::F128), + sym::sqrtf16 => ("sqrtf16", 1, fx.tcx.types.f16, types::F16), sym::sqrtf32 => ("sqrtf", 1, fx.tcx.types.f32, types::F32), sym::sqrtf64 => ("sqrt", 1, fx.tcx.types.f64, types::F64), + sym::sqrtf128 => ("sqrtf128", 1, fx.tcx.types.f128, types::F128), + sym::powif16 => ("__powisf2", 2, fx.tcx.types.f16, types::F16), // compiler-builtins sym::powif32 => ("__powisf2", 2, fx.tcx.types.f32, types::F32), // compiler-builtins sym::powif64 => ("__powidf2", 2, fx.tcx.types.f64, types::F64), // compiler-builtins + sym::powif128 => ("__powitf2", 2, fx.tcx.types.f128, types::F128), // compiler-builtins + sym::powf16 => ("powf16", 2, fx.tcx.types.f16, types::F16), sym::powf32 => ("powf", 2, fx.tcx.types.f32, types::F32), sym::powf64 => ("pow", 2, fx.tcx.types.f64, types::F64), + sym::powf128 => ("powf128", 2, 
fx.tcx.types.f128, types::F128),
+        sym::logf16 => ("logf16", 1, fx.tcx.types.f16, types::F16),
         sym::logf32 => ("logf", 1, fx.tcx.types.f32, types::F32),
         sym::logf64 => ("log", 1, fx.tcx.types.f64, types::F64),
+        sym::logf128 => ("logf128", 1, fx.tcx.types.f128, types::F128),
+        sym::log2f16 => ("log2f16", 1, fx.tcx.types.f16, types::F16),
         sym::log2f32 => ("log2f", 1, fx.tcx.types.f32, types::F32),
         sym::log2f64 => ("log2", 1, fx.tcx.types.f64, types::F64),
+        sym::log2f128 => ("log2f128", 1, fx.tcx.types.f128, types::F128),
+        sym::log10f16 => ("log10f16", 1, fx.tcx.types.f16, types::F16),
         sym::log10f32 => ("log10f", 1, fx.tcx.types.f32, types::F32),
         sym::log10f64 => ("log10", 1, fx.tcx.types.f64, types::F64),
+        sym::log10f128 => ("log10f128", 1, fx.tcx.types.f128, types::F128),
+        sym::fabsf16 => ("fabsf16", 1, fx.tcx.types.f16, types::F16),
         sym::fabsf32 => ("fabsf", 1, fx.tcx.types.f32, types::F32),
         sym::fabsf64 => ("fabs", 1, fx.tcx.types.f64, types::F64),
+        sym::fabsf128 => ("fabsf128", 1, fx.tcx.types.f128, types::F128),
+        sym::fmaf16 => ("fmaf16", 3, fx.tcx.types.f16, types::F16),
         sym::fmaf32 => ("fmaf", 3, fx.tcx.types.f32, types::F32),
         sym::fmaf64 => ("fma", 3, fx.tcx.types.f64, types::F64),
+        sym::fmaf128 => ("fmaf128", 3, fx.tcx.types.f128, types::F128),
         // FIXME: calling `fma` from libc without FMA target feature uses expensive software emulation
+        sym::fmuladdf16 => ("fmaf16", 3, fx.tcx.types.f16, types::F16), // TODO: use cranelift intrinsic analogous to llvm.fmuladd.f16
         sym::fmuladdf32 => ("fmaf", 3, fx.tcx.types.f32, types::F32), // TODO: use cranelift intrinsic analogous to llvm.fmuladd.f32
         sym::fmuladdf64 => ("fma", 3, fx.tcx.types.f64, types::F64), // TODO: use cranelift intrinsic analogous to llvm.fmuladd.f64
+        sym::fmuladdf128 => ("fmaf128", 3, fx.tcx.types.f128, types::F128), // TODO: use cranelift intrinsic analogous to llvm.fmuladd.f128
+        sym::copysignf16 => ("copysignf16", 2, fx.tcx.types.f16, types::F16),
         sym::copysignf32 => ("copysignf", 2, fx.tcx.types.f32, types::F32),
         sym::copysignf64 => ("copysign", 2, fx.tcx.types.f64, types::F64),
+        sym::copysignf128 => ("copysignf128", 2, fx.tcx.types.f128, types::F128),
+        sym::floorf16 => ("floorf16", 1, fx.tcx.types.f16, types::F16),
         sym::floorf32 => ("floorf", 1, fx.tcx.types.f32, types::F32),
         sym::floorf64 => ("floor", 1, fx.tcx.types.f64, types::F64),
+        sym::floorf128 => ("floorf128", 1, fx.tcx.types.f128, types::F128),
+        sym::ceilf16 => ("ceilf16", 1, fx.tcx.types.f16, types::F16),
         sym::ceilf32 => ("ceilf", 1, fx.tcx.types.f32, types::F32),
         sym::ceilf64 => ("ceil", 1, fx.tcx.types.f64, types::F64),
+        sym::ceilf128 => ("ceilf128", 1, fx.tcx.types.f128, types::F128),
+        sym::truncf16 => ("truncf16", 1, fx.tcx.types.f16, types::F16),
         sym::truncf32 => ("truncf", 1, fx.tcx.types.f32, types::F32),
         sym::truncf64 => ("trunc", 1, fx.tcx.types.f64, types::F64),
+        sym::truncf128 => ("truncf128", 1, fx.tcx.types.f128, types::F128),
+        sym::round_ties_even_f16 => ("rintf16", 1, fx.tcx.types.f16, types::F16),
         sym::round_ties_even_f32 => ("rintf", 1, fx.tcx.types.f32, types::F32),
         sym::round_ties_even_f64 => ("rint", 1, fx.tcx.types.f64, types::F64),
+        sym::round_ties_even_f128 => ("rintf128", 1, fx.tcx.types.f128, types::F128),
+        sym::roundf16 => ("roundf16", 1, fx.tcx.types.f16, types::F16),
         sym::roundf32 => ("roundf", 1, fx.tcx.types.f32, types::F32),
         sym::roundf64 => ("round", 1, fx.tcx.types.f64, types::F64),
+        sym::roundf128 => ("roundf128", 1, fx.tcx.types.f128, types::F128),
+        sym::sinf16 => ("sinf16", 1, fx.tcx.types.f16,
types::F16), sym::sinf32 => ("sinf", 1, fx.tcx.types.f32, types::F32), sym::sinf64 => ("sin", 1, fx.tcx.types.f64, types::F64), + sym::sinf128 => ("sinf128", 1, fx.tcx.types.f128, types::F128), + sym::cosf16 => ("cosf16", 1, fx.tcx.types.f16, types::F16), sym::cosf32 => ("cosf", 1, fx.tcx.types.f32, types::F32), sym::cosf64 => ("cos", 1, fx.tcx.types.f64, types::F64), + sym::cosf128 => ("cosf128", 1, fx.tcx.types.f128, types::F128), _ => return false, }; @@ -380,13 +420,26 @@ fn codegen_float_intrinsic_call<'tcx>( }; let layout = fx.layout_of(ty); + // FIXME(bytecodealliance/wasmtime#8312): Use native Cranelift operations + // for `f16` and `f128` once the lowerings have been implemented in Cranelift. let res = match intrinsic { + sym::fmaf16 | sym::fmuladdf16 => { + CValue::by_val(codegen_f16_f128::fma_f16(fx, args[0], args[1], args[2]), layout) + } sym::fmaf32 | sym::fmaf64 | sym::fmuladdf32 | sym::fmuladdf64 => { CValue::by_val(fx.bcx.ins().fma(args[0], args[1], args[2]), layout) } + sym::copysignf16 => { + CValue::by_val(codegen_f16_f128::copysign_f16(fx, args[0], args[1]), layout) + } + sym::copysignf128 => { + CValue::by_val(codegen_f16_f128::copysign_f128(fx, args[0], args[1]), layout) + } sym::copysignf32 | sym::copysignf64 => { CValue::by_val(fx.bcx.ins().fcopysign(args[0], args[1]), layout) } + sym::fabsf16 => CValue::by_val(codegen_f16_f128::abs_f16(fx, args[0]), layout), + sym::fabsf128 => CValue::by_val(codegen_f16_f128::abs_f128(fx, args[0]), layout), sym::fabsf32 | sym::fabsf64 | sym::floorf32 @@ -416,12 +469,36 @@ fn codegen_float_intrinsic_call<'tcx>( // These intrinsics aren't supported natively by Cranelift. // Lower them to a libcall. - sym::powif32 | sym::powif64 => { + sym::powif16 | sym::powif32 | sym::powif64 | sym::powif128 => { + let temp; + let (clif_ty, args) = if intrinsic == sym::powif16 { + temp = [codegen_f16_f128::f16_to_f32(fx, args[0]), args[1]]; + (types::F32, temp.as_slice()) + } else { + (clif_ty, args) + }; let input_tys: Vec<_> = vec![AbiParam::new(clif_ty), lib_call_arg_param(fx.tcx, types::I32, true)]; let ret_val = fx.lib_call(name, input_tys, vec![AbiParam::new(clif_ty)], &args)[0]; + let ret_val = if intrinsic == sym::powif16 { + codegen_f16_f128::f32_to_f16(fx, ret_val) + } else { + ret_val + }; CValue::by_val(ret_val, fx.layout_of(ty)) } + sym::powf16 => { + // FIXME(f16_f128): Rust `compiler-builtins` doesn't export `powf16` yet. 
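+            // As a stopgap, promote both operands to `f32`, call `powf` and
+            // demote the result back to `f16`.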
+            let x = codegen_f16_f128::f16_to_f32(fx, args[0]);
+            let y = codegen_f16_f128::f16_to_f32(fx, args[1]);
+            let ret_val = fx.lib_call(
+                "powf",
+                vec![AbiParam::new(types::F32), AbiParam::new(types::F32)],
+                vec![AbiParam::new(types::F32)],
+                &[x, y],
+            )[0];
+            CValue::by_val(codegen_f16_f128::f32_to_f16(fx, ret_val), fx.layout_of(ty))
+        }
         _ => {
             let input_tys: Vec<_> = args.iter().map(|_| AbiParam::new(clif_ty)).collect();
             let ret_val = fx.lib_call(name, input_tys, vec![AbiParam::new(clif_ty)], &args)[0];

From 02195f56c718defd1df04a452530b14cc96155c9 Mon Sep 17 00:00:00 2001
From: beetrees
Date: Fri, 23 May 2025 15:51:51 +0100
Subject: [PATCH 8/8] Enable tests and `compiler-builtins` for `f16`/`f128`

---
 build_system/build_sysroot.rs |  2 +-
 scripts/setup_rust_fork.sh    |  2 +-
 scripts/test_rustc_tests.sh   |  8 --------
 src/lib.rs                    | 32 +++++++++++++++++++++++++++-----
 4 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/build_system/build_sysroot.rs b/build_system/build_sysroot.rs
index a6e956c51..00955998e 100644
--- a/build_system/build_sysroot.rs
+++ b/build_system/build_sysroot.rs
@@ -235,7 +235,7 @@ fn build_clif_sysroot_for_triple(
     compiler.rustflags.extend(rustflags);
     let mut build_cmd = STANDARD_LIBRARY.build(&compiler, dirs);
     build_cmd.arg("--release");
-    build_cmd.arg("--features").arg("backtrace panic-unwind compiler-builtins-no-f16-f128");
+    build_cmd.arg("--features").arg("backtrace panic-unwind");
     build_cmd.arg(format!("-Zroot-dir={}", STDLIB_SRC.to_path(dirs).display()));
     build_cmd.env("CARGO_PROFILE_RELEASE_DEBUG", "true");
     build_cmd.env("__CARGO_DEFAULT_LIB_METADATA", "cg_clif");
diff --git a/scripts/setup_rust_fork.sh b/scripts/setup_rust_fork.sh
index ca6426f2b..532702bb1 100644
--- a/scripts/setup_rust_fork.sh
+++ b/scripts/setup_rust_fork.sh
@@ -43,7 +43,7 @@ verbose-tests = false
 # disabled bootstrap will crash trying to copy llvm tools for the bootstrap
 # compiler.
 llvm-tools = false
-std-features = ["panic-unwind", "compiler-builtins-no-f16-f128"]
+std-features = ["panic-unwind"]
 
 EOF
diff --git a/scripts/test_rustc_tests.sh b/scripts/test_rustc_tests.sh
index d014b6881..32c71f433 100755
--- a/scripts/test_rustc_tests.sh
+++ b/scripts/test_rustc_tests.sh
@@ -72,14 +72,6 @@ rm tests/ui/consts/precise-drop-with-coverage.rs
 rm tests/ui/issues/issue-85461.rs
 rm -r tests/ui/instrument-coverage/
 
-# missing f16/f128 support
-rm tests/ui/half-open-range-patterns/half-open-range-pats-semantics.rs
-rm tests/ui/asm/aarch64/type-f16.rs
-rm tests/ui/float/conv-bits-runtime-const.rs
-rm tests/ui/consts/const-eval/float_methods.rs
-rm tests/ui/match/match-float.rs
-rm tests/ui/float/target-has-reliable-nightly-float.rs
-
 # optimization tests
 # ==================
 rm tests/ui/codegen/issue-28950.rs # depends on stack size optimizations
diff --git a/src/lib.rs b/src/lib.rs
index 03dfd4956..8ef623cde 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -201,14 +201,36 @@ impl CodegenBackend for CraneliftCodegenBackend {
         // FIXME do `unstable_target_features` properly
         let unstable_target_features = target_features.clone();
 
+        // FIXME(f16_f128): LLVM 20 (currently used by `rustc`) passes `f128` in XMM registers on
+        // Windows, whereas LLVM 21+ and Cranelift pass it indirectly. This means that `f128` won't
+        // work when linking against an LLVM-built sysroot.
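+        // `is_like_windows` covers both the MSVC and GNU Windows targets
+        // affected by the ABI difference described above.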
+        let has_reliable_f128 = !sess.target.is_like_windows;
+        let has_reliable_f16 = match &*sess.target.arch {
+            // FIXME(f16_f128): LLVM 20 does not support `f16` on s390x, meaning the required
+            // builtins are not available in `compiler-builtins`.
+            "s390x" => false,
+            // FIXME(f16_f128): `rustc_codegen_llvm` currently disables support on Windows GNU
+            // targets due to GCC using a different ABI than LLVM. Therefore `f16` won't be
+            // available when using an LLVM-built sysroot.
+            "x86_64"
+                if sess.target.os == "windows"
+                    && sess.target.env == "gnu"
+                    && sess.target.abi != "llvm" =>
+            {
+                false
+            }
+            _ => true,
+        };
+
         TargetConfig {
             target_features,
             unstable_target_features,
-            // Cranelift does not yet support f16 or f128
-            has_reliable_f16: false,
-            has_reliable_f16_math: false,
-            has_reliable_f128: false,
-            has_reliable_f128_math: false,
+            // `rustc_codegen_cranelift` polyfills functionality not yet
+            // available in Cranelift.
+            has_reliable_f16,
+            has_reliable_f16_math: has_reliable_f16,
+            has_reliable_f128,
+            has_reliable_f128_math: has_reliable_f128,
         }
     }