From ae915aea3243e69e354158087a0aa85c4c7ad25a Mon Sep 17 00:00:00 2001
From: David Keller <davidkeller@tuta.io>
Date: Fri, 24 Sep 2021 17:35:28 +0200
Subject: [PATCH] Int128 compiler-rt methods (Int128 literal support part 1)
 (#11206)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Johannes Müller <straightshoota@gmail.com>
---
 spec/compiler/codegen/arithmetics_spec.cr     |   6 +-
 .../std/crystal/compiler_rt/divmod128_spec.cr |  93 ++++++++
 spec/std/crystal/compiler_rt/mulodi4_spec.cr  |   2 +-
 spec/std/crystal/compiler_rt/mulosi4_spec.cr  |  76 +++++++
 spec/std/crystal/compiler_rt/muloti4_spec.cr  | 151 +++++++++++++
 spec/std/int_spec.cr                          |  68 ++++--
 spec/std/uint_spec.cr                         |   8 +
 spec/win32_std_spec.cr                        |   2 +
 src/crystal/compiler_rt.cr                    |   3 +-
 src/crystal/compiler_rt/divmod128.cr          | 205 ++++++++++++++++++
 src/crystal/compiler_rt/mul.cr                |  42 ++++
 src/crystal/compiler_rt/mulodi4.cr            |  37 ----
 12 files changed, 635 insertions(+), 58 deletions(-)
 create mode 100644 spec/std/crystal/compiler_rt/divmod128_spec.cr
 create mode 100644 spec/std/crystal/compiler_rt/mulosi4_spec.cr
 create mode 100644 spec/std/crystal/compiler_rt/muloti4_spec.cr
 create mode 100644 src/crystal/compiler_rt/divmod128.cr
 create mode 100644 src/crystal/compiler_rt/mul.cr
 delete mode 100644 src/crystal/compiler_rt/mulodi4.cr

diff --git a/spec/compiler/codegen/arithmetics_spec.cr b/spec/compiler/codegen/arithmetics_spec.cr
index adc1ccd32ca8..9bcf008e6b4d 100644
--- a/spec/compiler/codegen/arithmetics_spec.cr
+++ b/spec/compiler/codegen/arithmetics_spec.cr
@@ -1,15 +1,13 @@
 require "../../spec_helper"
 
-{% if flag?(:darwin) %}
+# Int128 and UInt128 specs do not pass on win32 because of missing compiler-rt symbols
+{% unless flag?(:win32) %}
   SupportedInts            = [UInt8, UInt16, UInt32, UInt64, UInt128, Int8, Int16, Int32, Int64, Int128]
   SupportedIntsConversions = {
     to_i8: Int8, to_i16: Int16, to_i32: Int32, to_i64: Int64, to_i128: Int128,
     to_u8: UInt8, to_u16: UInt16, to_u32: UInt32, to_u64: UInt64, to_u128: UInt128,
   }
 {% else %}
-  # Skip Int128 and UInt128 on linux platforms due to compiler-rt dependency.
-  # PreviewOverflowFlags includes compiler_rt flag to support Int64 overflow
-  # detection in 32 bits platforms.
   SupportedInts            = [UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64]
   SupportedIntsConversions = {
     to_i8: Int8, to_i16: Int16, to_i32: Int32, to_i64: Int64,
diff --git a/spec/std/crystal/compiler_rt/divmod128_spec.cr b/spec/std/crystal/compiler_rt/divmod128_spec.cr
new file mode 100644
index 000000000000..5dfd30dc180c
--- /dev/null
+++ b/spec/std/crystal/compiler_rt/divmod128_spec.cr
@@ -0,0 +1,93 @@
+require "spec"
+
+# TODO: Replace helper methods with literals once possible
+
+private def make_ti(a : Int128, b : Int128)
+  b + (a << 64) # move `a` into the upper 64 bits, then add `b`
+end
+
+private def make_tu(a : UInt128, b : UInt128)
+  b + (a << 64) # move `a` into the upper 64 bits, then add `b`
+end
+
+# Ported from:
+# - https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/test/builtins/Unit/umodti3_test.c
+# - https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/test/builtins/Unit/udivti3_test.c
+# - https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/test/builtins/Unit/modti3_test.c
+# - https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/test/builtins/Unit/divti3_test.c
+
+private def test__divti3(a : Int128, b : Int128, expected : Int128, file = __FILE__, line = __LINE__)
+  it "passes compiler-rt builtins unit tests" do
+    # Check the builtin's quotient, passing the caller-supplied file/line through to `should`.
+    __divti3(a, b).should eq(expected), file: file, line: line
+  end
+end
+
+private def test__modti3(a : Int128, b : Int128, expected : Int128, file = __FILE__, line = __LINE__)
+  it "passes compiler-rt builtins unit tests" do
+    # Check the builtin's remainder, passing the caller-supplied file/line through to `should`.
+    __modti3(a, b).should eq(expected), file: file, line: line
+  end
+end
+
+private def test__udivti3(a : UInt128, b : UInt128, expected : UInt128, file = __FILE__, line = __LINE__)
+  it "passes compiler-rt builtins unit tests" do
+    # Check the builtin's quotient, passing the caller-supplied file/line through to `should`.
+    __udivti3(a, b).should eq(expected), file: file, line: line
+  end
+end
+
+private def test__umodti3(a : UInt128, b : UInt128, expected : UInt128, file = __FILE__, line = __LINE__)
+  it "passes compiler-rt builtins unit tests" do
+    # Check the builtin's remainder, passing the caller-supplied file/line through to `should`.
+    __umodti3(a, b).should eq(expected), file: file, line: line
+  end
+end
+
+describe "__divti3" do # each case is (dividend, divisor, expected quotient)
+  test__divti3(0, 1, 0)
+  test__divti3(0, -1, 0)
+  test__divti3(2, 1, 2)
+  test__divti3(2, -1, -2)
+  test__divti3(-2, 1, -2)
+  test__divti3(-2, -1, 2)
+  test__divti3(make_ti(-9223372036854775808, 0x0), 1, make_ti(-9223372036854775808, 0x0)) # dividend is -(2**127), the smallest Int128
+  test__divti3(make_ti(-9223372036854775808, 0x0), -1, make_ti(-9223372036854775808, 0x0)) # 2**127 is not representable; the wrapped value is expected
+  test__divti3(make_ti(-9223372036854775808, 0x0), -2, make_ti(0x4000000000000000, 0x0))
+  test__divti3(make_ti(-9223372036854775808, 0x0), 2, make_ti(-0x4000000000000000, 0x0))
+end
+
+describe "__modti3" do # each case is (dividend, divisor, expected remainder)
+  test__modti3(0, 1, 0)
+  test__modti3(0, -1, 0)
+
+  test__modti3(5, 3, 2) # in these four cases the expected remainder has the dividend's sign
+  test__modti3(5, -3, 2)
+  test__modti3(-5, 3, -2)
+  test__modti3(-5, -3, -2)
+
+  test__modti3(make_ti(-9223372036854775808, 0x0), 1, 0) # dividend is -(2**127), the smallest Int128
+  test__modti3(make_ti(-9223372036854775808, 0x0), -1, 0)
+  test__modti3(make_ti(-9223372036854775808, 0x0), 2, 0)
+  test__modti3(make_ti(-9223372036854775808, 0x0), -2, 0)
+  test__modti3(make_ti(-9223372036854775808, 0x0), 3, -2)
+  test__modti3(make_ti(-9223372036854775808, 0x0), -3, -2)
+end
+
+describe "__udivti3" do # each case is (dividend, divisor, expected quotient)
+  test__udivti3(0, 1, 0)
+  test__udivti3(2, 1, 2)
+
+  test__udivti3(make_tu(0x0, 0x8000000000000000), 1, make_tu(0x0, 0x8000000000000000)) # dividend is 2**63
+  test__udivti3(make_tu(0x0, 0x8000000000000000), 2, make_tu(0x0, 0x4000000000000000))
+  test__udivti3(make_tu(0xffffffffffffffff, 0xffffffffffffffff), 2, make_tu(0x7fffffffffffffff, 0xffffffffffffffff)) # dividend is UInt128::MAX
+end
+
+describe "__umodti3" do # each case is (dividend, divisor, expected remainder)
+  test__umodti3(0, 1, 0)
+  test__umodti3(2, 1, 0)
+
+  test__umodti3(make_tu(0x0, 0x8000000000000000), 1, 0) # dividend is 2**63
+  test__umodti3(make_tu(0x0, 0x8000000000000000), 2, 0)
+  test__umodti3(make_tu(0xffffffffffffffff, 0xffffffffffffffff), 2, 1) # dividend is UInt128::MAX (odd)
+end
diff --git a/spec/std/crystal/compiler_rt/mulodi4_spec.cr b/spec/std/crystal/compiler_rt/mulodi4_spec.cr
index 3e2586cd5d6a..1c413bdb8e77 100644
--- a/spec/std/crystal/compiler_rt/mulodi4_spec.cr
+++ b/spec/std/crystal/compiler_rt/mulodi4_spec.cr
@@ -1,6 +1,6 @@
 require "spec"
 
-# Ported from compiler-rt:test/builtins/Unit/mulodi4_test.c
+# Ported from https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/test/builtins/Unit/mulodi4_test.c
 
 private def test__mulodi4(a : Int64, b : Int64, expected : Int64, expected_overflow : Int32, file = __FILE__, line = __LINE__)
   it "passes compiler-rt builtins unit tests" do
diff --git a/spec/std/crystal/compiler_rt/mulosi4_spec.cr b/spec/std/crystal/compiler_rt/mulosi4_spec.cr
new file mode 100644
index 000000000000..e303ab4759b8
--- /dev/null
+++ b/spec/std/crystal/compiler_rt/mulosi4_spec.cr
@@ -0,0 +1,76 @@
+require "spec"
+
+# Ported from https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/test/builtins/Unit/mulosi4_test.c
+
+private def test__mulosi4(a : Int32, b : Int32, expected : Int32, expected_overflow : Int32, file = __FILE__, line = __LINE__)
+  it "passes compiler-rt builtins unit tests" do
+    actual_overflow : Int32 = 0
+    actual = __mulosi4(a, b, pointerof(actual_overflow))
+    actual_overflow.should eq(expected_overflow), file: file, line: line
+    # The product is only compared when no overflow is expected. Every Int32 is
+    # truthy in Crystal, so the flag must be compared with 0 explicitly.
+    actual.should(eq(expected), file: file, line: line) if expected_overflow == 0
+  end
+end
+
+describe "__mulosi4" do # each case is (a, b, expected, expected_overflow)
+  test__mulosi4(0, 0, 0, 0)
+  test__mulosi4(0, 1, 0, 0)
+  test__mulosi4(1, 0, 0, 0)
+  test__mulosi4(0, 10, 0, 0)
+  test__mulosi4(10, 0, 0, 0)
+  test__mulosi4(0, 0x1234567, 0, 0)
+  test__mulosi4(0x1234567, 0, 0, 0)
+
+  test__mulosi4(0, -1, 0, 0)
+  test__mulosi4(-1, 0, 0, 0)
+  test__mulosi4(0, -10, 0, 0)
+  test__mulosi4(-10, 0, 0, 0)
+  test__mulosi4(0, -0x1234567, 0, 0)
+  test__mulosi4(-0x1234567, 0, 0, 0)
+
+  test__mulosi4(1, 1, 1, 0)
+  test__mulosi4(1, 10, 10, 0)
+  test__mulosi4(10, 1, 10, 0)
+  test__mulosi4(1, 0x1234567, 0x1234567, 0)
+  test__mulosi4(0x1234567, 1, 0x1234567, 0)
+
+  test__mulosi4(1, -1, -1, 0)
+  test__mulosi4(1, -10, -10, 0)
+  test__mulosi4(-10, 1, -10, 0)
+  test__mulosi4(1, -0x1234567, -0x1234567, 0)
+  test__mulosi4(-0x1234567, 1, -0x1234567, 0)
+
+  test__mulosi4(0x7FFFFFFF, -2, -0x7fffffff, 1)
+  test__mulosi4(-2, 0x7FFFFFFF, -0x7fffffff, 1)
+  test__mulosi4(0x7FFFFFFF, -1, -0x7fffffff, 0)
+  test__mulosi4(-1, 0x7FFFFFFF, -0x7fffffff, 0)
+  test__mulosi4(0x7FFFFFFF, 0, 0, 0)
+  test__mulosi4(0, 0x7FFFFFFF, 0, 0)
+  test__mulosi4(0x7FFFFFFF, 1, 0x7FFFFFFF, 0)
+  test__mulosi4(1, 0x7FFFFFFF, 0x7FFFFFFF, 0)
+  test__mulosi4(0x7FFFFFFF, 2, -0x7fffffff, 1)
+  test__mulosi4(2, 0x7FFFFFFF, -0x7fffffff, 1)
+
+  test__mulosi4(-0x80000000, -2, -0x80000000, 1)
+  test__mulosi4(-2, -0x80000000, -0x80000000, 1)
+  test__mulosi4(-0x80000000, -1, -0x80000000, 1)
+  test__mulosi4(-1, -0x80000000, -0x80000000, 1)
+  test__mulosi4(-0x80000000, 0, 0, 0)
+  test__mulosi4(0, -0x80000000, 0, 0)
+  test__mulosi4(-0x80000000, 1, -0x80000000, 0)
+  test__mulosi4(1, -0x80000000, -0x80000000, 0)
+  test__mulosi4(-0x80000000, 2, -0x80000000, 1)
+  test__mulosi4(2, -0x80000000, -0x80000000, 1)
+
+  test__mulosi4(-0x7fffffff, -2, -0x7fffffff, 1)
+  test__mulosi4(-2, -0x7fffffff, -0x7fffffff, 1)
+  test__mulosi4(-0x7fffffff, -1, 0x7FFFFFFF, 0)
+  test__mulosi4(-1, -0x7fffffff, 0x7FFFFFFF, 0)
+  test__mulosi4(-0x7fffffff, 0, 0, 0)
+  test__mulosi4(0, -0x7fffffff, 0, 0)
+  test__mulosi4(-0x7fffffff, 1, -0x7fffffff, 0)
+  test__mulosi4(1, -0x7fffffff, -0x7fffffff, 0)
+  test__mulosi4(-0x7fffffff, 2, -0x80000000, 1)
+  test__mulosi4(2, -0x7fffffff, -0x80000000, 1)
+end
diff --git a/spec/std/crystal/compiler_rt/muloti4_spec.cr b/spec/std/crystal/compiler_rt/muloti4_spec.cr
new file mode 100644
index 000000000000..960193dbd6c6
--- /dev/null
+++ b/spec/std/crystal/compiler_rt/muloti4_spec.cr
@@ -0,0 +1,151 @@
+require "spec"
+
+# Ported from https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/test/builtins/Unit/muloti4_test.c
+
+private def test__muloti4(a : Int128, b : Int128, expected : Int128, expected_overflow : Int32, file = __FILE__, line = __LINE__)
+  it "passes compiler-rt builtins unit tests" do
+    actual_overflow : Int32 = 0
+    actual = __muloti4(a, b, pointerof(actual_overflow))
+    actual_overflow.should eq(expected_overflow), file: file, line: line
+    # The product is only compared when no overflow is expected. Every Int32 is
+    # truthy in Crystal, so the flag must be compared with 0 explicitly.
+    actual.should(eq(expected), file: file, line: line) if expected_overflow == 0
+  end
+end
+
+# TODO: Replace helper methods with literals once possible
+
+private def make_ti(a : Int128, b : Int128)
+  b + (a << 64) # move `a` into the upper 64 bits, then add `b`
+end
+
+describe "__muloti4" do # each case is (a, b, expected, expected_overflow)
+  test__muloti4(0, 0, 0, 0)
+  test__muloti4(0, 1, 0, 0)
+  test__muloti4(1, 0, 0, 0)
+  test__muloti4(0, 10, 0, 0)
+  test__muloti4(10, 0, 0, 0)
+  test__muloti4(0, 81985529216486895, 0, 0)
+  test__muloti4(81985529216486895, 0, 0, 0)
+  test__muloti4(0, -1, 0, 0)
+  test__muloti4(-1, 0, 0, 0)
+  test__muloti4(0, -10, 0, 0)
+  test__muloti4(-10, 0, 0, 0)
+  test__muloti4(0, -81985529216486895, 0, 0)
+  test__muloti4(-81985529216486895, 0, 0, 0)
+  test__muloti4(1, 1, 1, 0)
+  test__muloti4(1, 10, 10, 0)
+  test__muloti4(10, 1, 10, 0)
+  test__muloti4(1, 81985529216486895, 81985529216486895, 0)
+  test__muloti4(81985529216486895, 1, 81985529216486895, 0)
+  test__muloti4(1, -1, -1, 0)
+  test__muloti4(1, -10, -10, 0)
+  test__muloti4(-10, 1, -10, 0)
+  test__muloti4(1, -81985529216486895, -81985529216486895, 0)
+  test__muloti4(-81985529216486895, 1, -81985529216486895, 0)
+  test__muloti4(3037000499, 3037000499, 9223372030926249001, 0)
+  test__muloti4(-3037000499, 3037000499, -9223372030926249001, 0)
+  test__muloti4(3037000499, -3037000499, -9223372030926249001, 0)
+  test__muloti4(-3037000499, -3037000499, 9223372030926249001, 0)
+  test__muloti4(4398046511103, 2097152, 9223372036852678656, 0)
+  test__muloti4(-4398046511103, 2097152, -9223372036852678656, 0)
+  test__muloti4(4398046511103, -2097152, -9223372036852678656, 0)
+  test__muloti4(-4398046511103, -2097152, 9223372036852678656, 0)
+  test__muloti4(2097152, 4398046511103, 9223372036852678656, 0)
+  test__muloti4(-2097152, 4398046511103, -9223372036852678656, 0)
+  test__muloti4(2097152, -4398046511103, -9223372036852678656, 0)
+  test__muloti4(-2097152, -4398046511103, 9223372036852678656, 0)
+  test__muloti4(make_ti(0x00000000000000B5, 0x04F333F9DE5BE000),
+    make_ti(0x0000000000000000, 0x00B504F333F9DE5B),
+    make_ti(0x7FFFFFFFFFFFF328, 0xDF915DA296E8A000), 0)
+  test__muloti4(make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), # Int128::MAX
+    -2,
+    make_ti(0x8000000000000000, 0x0000000000000001), 1)
+  test__muloti4(-2,
+    make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
+    make_ti(0x8000000000000000, 0x0000000000000001), 1)
+  test__muloti4(make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
+    -1,
+    make_ti(0x8000000000000000, 0x0000000000000001), 0)
+  test__muloti4(-1,
+    make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
+    make_ti(0x8000000000000000, 0x0000000000000001), 0)
+  test__muloti4(make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
+    0,
+    0, 0)
+  test__muloti4(0,
+    make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
+    0, 0)
+  test__muloti4(make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
+    1,
+    make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 0)
+  test__muloti4(1,
+    make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
+    make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 0)
+  test__muloti4(make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
+    2,
+    make_ti(0x8000000000000000, 0x0000000000000001), 1)
+  test__muloti4(2,
+    make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF),
+    make_ti(0x8000000000000000, 0x0000000000000001), 1)
+  test__muloti4(make_ti(0x8000000000000000, 0x0000000000000000), # Int128::MIN
+    -2,
+    make_ti(0x8000000000000000, 0x0000000000000000), 1)
+  test__muloti4(-2,
+    make_ti(0x8000000000000000, 0x0000000000000000),
+    make_ti(0x8000000000000000, 0x0000000000000000), 1)
+  test__muloti4(make_ti(0x8000000000000000, 0x0000000000000000),
+    -1,
+    make_ti(0x8000000000000000, 0x0000000000000000), 1)
+  test__muloti4(-1,
+    make_ti(0x8000000000000000, 0x0000000000000000),
+    make_ti(0x8000000000000000, 0x0000000000000000), 1)
+  test__muloti4(make_ti(0x8000000000000000, 0x0000000000000000),
+    0,
+    0, 0)
+  test__muloti4(0,
+    make_ti(0x8000000000000000, 0x0000000000000000),
+    0, 0)
+  test__muloti4(make_ti(0x8000000000000000, 0x0000000000000000),
+    1,
+    make_ti(0x8000000000000000, 0x0000000000000000), 0)
+  test__muloti4(1,
+    make_ti(0x8000000000000000, 0x0000000000000000),
+    make_ti(0x8000000000000000, 0x0000000000000000), 0)
+  test__muloti4(make_ti(0x8000000000000000, 0x0000000000000000),
+    2,
+    make_ti(0x8000000000000000, 0x0000000000000000), 1)
+  test__muloti4(2,
+    make_ti(0x8000000000000000, 0x0000000000000000),
+    make_ti(0x8000000000000000, 0x0000000000000000), 1)
+  test__muloti4(make_ti(0x8000000000000000, 0x0000000000000001), # Int128::MIN + 1
+    -2,
+    make_ti(0x8000000000000000, 0x0000000000000001), 1)
+  test__muloti4(-2,
+    make_ti(0x8000000000000000, 0x0000000000000001),
+    make_ti(0x8000000000000000, 0x0000000000000001), 1)
+  test__muloti4(make_ti(0x8000000000000000, 0x0000000000000001),
+    -1,
+    make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 0)
+  test__muloti4(-1,
+    make_ti(0x8000000000000000, 0x0000000000000001),
+    make_ti(0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF), 0)
+  test__muloti4(make_ti(0x8000000000000000, 0x0000000000000001),
+    0,
+    0, 0)
+  test__muloti4(0,
+    make_ti(0x8000000000000000, 0x0000000000000001),
+    0, 0)
+  test__muloti4(make_ti(0x8000000000000000, 0x0000000000000001),
+    1,
+    make_ti(0x8000000000000000, 0x0000000000000001), 0)
+  test__muloti4(1,
+    make_ti(0x8000000000000000, 0x0000000000000001),
+    make_ti(0x8000000000000000, 0x0000000000000001), 0)
+  test__muloti4(make_ti(0x8000000000000000, 0x0000000000000001),
+    2,
+    make_ti(0x8000000000000000, 0x0000000000000000), 1)
+  test__muloti4(2,
+    make_ti(0x8000000000000000, 0x0000000000000001),
+    make_ti(0x8000000000000000, 0x0000000000000000), 1)
+end
diff --git a/spec/std/int_spec.cr b/spec/std/int_spec.cr
index e2aed420ed7d..54d0caf5a228 100644
--- a/spec/std/int_spec.cr
+++ b/spec/std/int_spec.cr
@@ -187,7 +187,14 @@ describe "Int" do
       it_converts_to_s 255_u8, "255"
       it_converts_to_s 65535_u16, "65535"
       it_converts_to_s 4294967295_u32, "4294967295"
+
       it_converts_to_s 18446744073709551615_u64, "18446744073709551615"
+
+      {% unless flag?(:win32) %}
+        it_converts_to_s UInt128::MAX, "340282366920938463463374607431768211455"
+        it_converts_to_s Int128::MAX, "170141183460469231731687303715884105727"
+        it_converts_to_s Int128::MIN, "-170141183460469231731687303715884105728"
+      {% end %}
     end
 
     context "base and upcase parameters" do
@@ -468,6 +475,9 @@ describe "Int" do
       Int64.new(1).should be_a(Int64)
       Int64.new(1).should eq(1)
 
+      Int128.new(1).should be_a(Int128)
+      Int128.new(1).should eq(1)
+
       UInt8.new(1).should be_a(UInt8)
       UInt8.new(1).should eq(1)
 
@@ -479,6 +489,9 @@ describe "Int" do
 
       UInt64.new(1).should be_a(UInt64)
       UInt64.new(1).should eq(1)
+
+      UInt128.new(1).should be_a(UInt128)
+      UInt128.new(1).should eq(1)
     end
   end
 
@@ -507,6 +520,10 @@ describe "Int" do
 
       (UInt8::MIN / -1).should eq(0)
     end
+
+    pending_win32 "divides Int128::MIN by -1" do
+      (Int128::MIN / -1).should eq(-(Int128::MIN.to_f64))
+    end
   end
 
   describe "floor division //" do
@@ -518,6 +535,16 @@ describe "Int" do
       {% end %}
     end
 
+    # Missing symbols: __floattidf, __floatuntidf, __fixdfti, __fixsfti, __fixunsdfti, __fixunssfti, __floatuntisf, __floattisf
+    # These symbols are all required to convert U/Int128s to Floats
+    pending_win32 "preserves type of lhs (128-bit)" do
+      {% for type in [UInt128, Int128] %}
+        ({{type}}.new(7) // 2).should be_a({{type}})
+        ({{type}}.new(7) // 2.0).should be_a({{type}})
+        ({{type}}.new(7) // 2.0_f32).should be_a({{type}})
+      {% end %}
+    end
+
     it "divides negative numbers" do
       (7 // 2).should eq(3)
       (-7 // 2).should eq(-4)
@@ -558,6 +585,7 @@ describe "Int" do
     expect_raises(ArgumentError) { Int16::MIN // -1 }
     expect_raises(ArgumentError) { Int32::MIN // -1 }
     expect_raises(ArgumentError) { Int64::MIN // -1 }
+    expect_raises(ArgumentError) { Int128::MIN // -1 }
 
     (UInt8::MIN // -1).should eq(0)
   end
@@ -583,8 +611,8 @@ describe "Int" do
   end
 
   it "returns 0 when doing IntN::MIN % -1 (#8306)" do
-    {% for n in [8, 16, 32, 64] %}
-      (Int{{n}}::MIN % -1_i{{n}}).should eq(0)
+    {% for n in [8, 16, 32, 64, 128] %}
+      (Int{{n}}::MIN % -1.to_i{{n}}).should eq(0)
     {% end %}
   end
 
@@ -597,8 +625,8 @@ describe "Int" do
   end
 
   it "returns 0 when doing IntN::MIN.remainder(-1) (#8306)" do
-    {% for n in [8, 16, 32, 64] %}
-      (Int{{n}}::MIN.remainder(-1_i{{n}})).should eq(0)
+    {% for n in [8, 16, 32, 64, 128] %}
+      (Int{{n}}::MIN.remainder(-1.to_i{{n}})).should eq(0)
     {% end %}
   end
 
@@ -734,27 +762,31 @@ describe "Int" do
     it { 5_i64.popcount.should eq(2) }
     it { 9223372036854775807_i64.popcount.should eq(63) }
     it { 18446744073709551615_u64.popcount.should eq(64) }
+
+    it { 0_i128.popcount.should eq(0) }
+    it { Int128::MAX.popcount.should eq(127) }
+    it { UInt128::MAX.popcount.should eq(128) }
   end
 
   describe "#leading_zeros_count" do
-    {% for width in %w(8 16 32 64).map(&.id) %}
-      it { -1_i{{width}}.leading_zeros_count.should eq(0) }
-      it { 0_i{{width}}.leading_zeros_count.should eq({{width}}) }
-      it { 0_u{{width}}.leading_zeros_count.should eq({{width}}) }
+    {% for width in %w(8 16 32 64 128).map(&.id) %}
+      it { -1.to_i{{width}}.leading_zeros_count.should eq(0) }
+      it { 0.to_i{{width}}.leading_zeros_count.should eq({{width}}) }
+      it { 0.to_u{{width}}.leading_zeros_count.should eq({{width}}) }
     {% end %}
   end
 
   describe "#trailing_zeros_count" do
-    {% for width in %w(8 16 32 64).map(&.id) %}
-      it { -2_i{{width}}.trailing_zeros_count.should eq(1) }
-      it { 2_i{{width}}.trailing_zeros_count.should eq(1) }
-      it { 2_u{{width}}.trailing_zeros_count.should eq(1) }
+    {% for width in %w(8 16 32 64 128).map(&.id) %}
+      it { -2.to_i{{width}}.trailing_zeros_count.should eq(1) }
+      it { 2.to_i{{width}}.trailing_zeros_count.should eq(1) }
+      it { 2.to_u{{width}}.trailing_zeros_count.should eq(1) }
     {% end %}
   end
 
   pending_win32 "compares signed vs. unsigned integers" do
-    signed_ints = [Int8::MAX, Int16::MAX, Int32::MAX, Int64::MAX, Int8::MIN, Int16::MIN, Int32::MIN, Int64::MIN, 0_i8, 0_i16, 0_i32, 0_i64]
-    unsigned_ints = [UInt8::MAX, UInt16::MAX, UInt32::MAX, UInt64::MAX, 0_u8, 0_u16, 0_u32, 0_u64]
+    signed_ints = [Int8::MAX, Int16::MAX, Int32::MAX, Int64::MAX, Int128::MAX, Int8::MIN, Int16::MIN, Int32::MIN, Int64::MIN, Int128::MIN, 0_i8, 0_i16, 0_i32, 0_i64, 0_i128]
+    unsigned_ints = [UInt8::MAX, UInt16::MAX, UInt32::MAX, UInt64::MAX, UInt128::MAX, 0_u8, 0_u16, 0_u32, 0_u64, 0_u128]
 
     big_signed_ints = signed_ints.map &.to_big_i
     big_unsigned_ints = unsigned_ints.map &.to_big_i
@@ -781,7 +813,7 @@ describe "Int" do
   end
 
   it "clones" do
-    [1_u8, 2_u16, 3_u32, 4_u64, 5_i8, 6_i16, 7_i32, 8_i64].each do |value|
+    [1_u8, 2_u16, 3_u32, 4_u64, 5.to_u128, 6_i8, 7_i16, 8_i32, 9_i64, 10.to_i128].each do |value|
       value.clone.should eq(value)
     end
   end
@@ -841,6 +873,12 @@ describe "Int" do
       UInt64::MAX.digits.should eq(UInt64::MAX.to_s.chars.map(&.to_i).reverse)
     end
 
+    # Missing symbol __floatuntidf on windows
+    pending_win32 "works for u/int128 maximums" do
+      Int128::MAX.digits.should eq(Int128::MAX.to_s.chars.map(&.to_i).reverse)
+      UInt128::MAX.digits.should eq(UInt128::MAX.to_s.chars.map(&.to_i).reverse)
+    end
+
     it "works for non-Int32" do
       digits = 123_i64.digits
       digits.should eq([3, 2, 1])
diff --git a/spec/std/uint_spec.cr b/spec/std/uint_spec.cr
index 7de658f63b70..bd69c6ed3974 100644
--- a/spec/std/uint_spec.cr
+++ b/spec/std/uint_spec.cr
@@ -48,6 +48,14 @@ describe "UInt" do
       x = &-18446744073709551615_u64
       x.should eq(1_u64)
       x.should be_a(UInt64)
+
+      x = &-1_u128
+      x.should eq(UInt128::MAX) # TODO: Change to literal once supported
+      x.should be_a(UInt128)
+
+      x = &-(UInt128::MAX) # TODO: Change to literal once supported
+      x.should eq(1_u128)
+      x.should be_a(UInt128)
     end
   end
 end
diff --git a/spec/win32_std_spec.cr b/spec/win32_std_spec.cr
index e311e87fe8f3..1f0898f6518d 100644
--- a/spec/win32_std_spec.cr
+++ b/spec/win32_std_spec.cr
@@ -34,7 +34,9 @@ require "./std/crypto/bcrypt/password_spec.cr"
 require "./std/crypto/bcrypt_spec.cr"
 require "./std/crypto/blowfish_spec.cr"
 require "./std/crypto/subtle_spec.cr"
+# require "./std/crystal/compiler_rt/muloti4_spec.cr" (failed to run)
 require "./std/crystal/compiler_rt/mulodi4_spec.cr"
+require "./std/crystal/compiler_rt/mulosi4_spec.cr"
 require "./std/crystal/digest/md5_spec.cr"
 require "./std/crystal/digest/sha1_spec.cr"
 require "./std/crystal/hasher_spec.cr"
diff --git a/src/crystal/compiler_rt.cr b/src/crystal/compiler_rt.cr
index d52e22ba6ebd..e3a557ca595d 100644
--- a/src/crystal/compiler_rt.cr
+++ b/src/crystal/compiler_rt.cr
@@ -1,3 +1,4 @@
 {% skip_file if flag?(:skip_crystal_compiler_rt) %}
 
-require "./compiler_rt/mulodi4.cr"
+require "./compiler_rt/mul.cr"
+require "./compiler_rt/divmod128.cr"
diff --git a/src/crystal/compiler_rt/divmod128.cr b/src/crystal/compiler_rt/divmod128.cr
new file mode 100644
index 000000000000..91f59f4664c6
--- /dev/null
+++ b/src/crystal/compiler_rt/divmod128.cr
@@ -0,0 +1,205 @@
+# This file includes an implementation of (U)Int128 modulo/division operations
+
+# :nodoc:
+fun __divti3(a : Int128, b : Int128) : Int128
+  # Signed 128-bit division built on the unsigned `_u128_div_rem` helper.
+  # Ported from https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/lib/builtins/int_div_impl.inc
+  sign_a = a >> 127 # -1 when a is negative, 0 otherwise
+  sign_b = b >> 127 # -1 when b is negative, 0 otherwise
+  abs_a = (a ^ sign_a) &- sign_a # two's-complement negate when sign_a == -1
+  abs_b = (b ^ sign_b) &- sign_b # two's-complement negate when sign_b == -1
+  quo_sign = sign_a ^ sign_b # -1 when the operand signs differ
+  quotient = _u128_div_rem(abs_a.to_u128!, abs_b.to_u128!)[0]
+  ((quotient ^ quo_sign) &- quo_sign).to_i128! # negate when quo_sign == -1
+end
+
+# :nodoc:
+fun __modti3(a : Int128, b : Int128) : Int128
+  # Signed 128-bit remainder; the result takes the sign of the dividend `a`.
+  # Ported from https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/lib/builtins/int_div_impl.inc
+  sign_b = b >> 127 # -1 when b is negative, 0 otherwise
+  sign_a = a >> 127 # -1 when a is negative, 0 otherwise
+  abs_b = (b ^ sign_b) &- sign_b # two's-complement negate when sign_b == -1
+  abs_a = (a ^ sign_a) &- sign_a # two's-complement negate when sign_a == -1
+  remainder = _u128_div_rem(abs_a.to_u128!, abs_b.to_u128!)[1]
+  (remainder.to_i128! ^ sign_a) &- sign_a # reapply the dividend's sign
+end
+
+# :nodoc:
+fun __udivti3(a : UInt128, b : UInt128) : UInt128
+  # Unsigned 128-bit division: returns the quotient computed by `_u128_div_rem`.
+  # Ported from https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/lib/builtins/int_div_impl.inc
+
+  _u128_div_rem(a, b)[0]
+end
+
+# :nodoc:
+fun __umodti3(a : UInt128, b : UInt128) : UInt128
+  # Unsigned 128-bit modulo: returns the remainder computed by `_u128_div_rem`.
+  # Ported from https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/lib/builtins/int_div_impl.inc
+
+  _u128_div_rem(a, b)[1]
+end
+
+# :nodoc:
+def _carrying_mul(lhs : UInt64, rhs : UInt64) : Tuple(UInt64, UInt64)
+  # Ported from https://github.com/rust-lang/compiler-builtins/blob/2be2bc086bd9b3c0fc8eb8d2dc7df025e6ffd318/src/int/specialized_div_rem/trifecta.rs
+  # Returns the full 128-bit product of the operands as {low 64 bits, high 64 bits}.
+  product = rhs.to_u128! &* lhs.to_u128!
+  {product.to_u64!, (product >> 64).to_u64!}
+end
+
+# :nodoc:
+def _carrying_mul_add(lhs : UInt64, mul : UInt64, add : UInt64) : Tuple(UInt64, UInt64)
+  # Ported from https://github.com/rust-lang/compiler-builtins/blob/2be2bc086bd9b3c0fc8eb8d2dc7df025e6ffd318/src/int/specialized_div_rem/trifecta.rs
+  # Computes lhs * mul + add in 128 bits and returns it as {low 64 bits, high 64 bits}.
+  wide = (lhs.to_u128! &* mul.to_u128!) &+ add.to_u128!
+  low = wide.to_u64!
+  high = (wide >> 64).to_u64!
+  {low, high}
+end
+
+# :nodoc:
+def _u128_div_rem(duo : UInt128, div : UInt128) : Tuple(UInt128, UInt128)
+  # Ported from https://github.com/rust-lang/compiler-builtins/blob/2be2bc086bd9b3c0fc8eb8d2dc7df025e6ffd318/src/int/specialized_div_rem/trifecta.rs
+  # Returns `{quotient, remainder}` of the unsigned division of `duo` by `div`.
+  # Rust also has another algorithm for 128-bit integer division
+  # for microarchitectures that have slow hardware integer division.
+
+  # This algorithm is called the trifecta algorithm because it uses three main algorithms:
+  # - short division for small divisors
+  # - the two possibility algorithm for large divisors
+  # - an undersubtracting long division algorithm for intermediate cases
+
+  div_lz = div.leading_zeros_count
+  duo_lz = duo.leading_zeros_count
+
+  if div_lz <= duo_lz
+    # Resulting quotient is 0 or 1 at this point
+    # The highest set bit of `duo` needs to be at least one place higher than `div` for the quotient to be more than one.
+    if duo >= div
+      return {1_u128, duo - div}
+    else
+      return {0_u128, duo}
+    end
+  end
+
+  # Use 64-bit integer division if possible
+  if duo_lz >= 64
+    # duo fits in a 64-bit integer
+    # Because the previous branch (div_lz <= duo_lz) was not taken, div also fits in a 64-bit integer
+    quo_local1 = duo.to_u64! // div.to_u64!
+    rem_local1 = duo.to_u64! % div.to_u64!
+    return {quo_local1.to_u128!, rem_local1.to_u128!}
+  end
+
+  # Short division branch (div has at most 32 significant bits)
+  if div_lz >= 96
+    duo_hi = (duo >> 64).to_u64!
+    div_0 = div.to_u32!.to_u64!
+    quo_hi = duo_hi // div_0
+    rem_3 = duo_hi % div_0
+
+    duo_mid = (duo >> 32).to_u32!.to_u64! | (rem_3 << 32)
+    quo_1 = duo_mid // div_0
+    rem_2 = duo_mid % div_0
+
+    duo_lo = duo.to_u32!.to_u64! | (rem_2 << 32)
+    quo_0 = duo_lo // div_0
+    rem_1 = duo_lo % div_0
+
+    return {quo_0.to_u128! | (quo_1.to_u128! << 32) | (quo_hi.to_u128! << 64), rem_1.to_u128!}
+  end
+
+  # Relative leading significant bits (cannot overflow because of above branches)
+  lz_diff = div_lz - duo_lz
+
+  if lz_diff < 32
+    # Two possibility division algorithm
+
+    # The most significant bits of duo and div are within 32 bits of each other.
+    # If we take the n most significant bits of duo and divide them by the corresponding bits in div, it produces the quotient value quo.
+    # It happens that quo or quo - 1 will always be the correct quotient for the whole number.
+
+    shift = 64 - duo_lz
+    duo_sig_n = (duo >> shift).to_u64!
+    div_sig_n = (div >> shift).to_u64!
+    quo_local2 = duo_sig_n // div_sig_n
+
+    # The larger quo can overflow, so a manual carrying mul is used with manual overflow checking.
+    div_lo = div.to_u64!
+    div_hi = (div >> 64).to_u64!
+    tmp_lo, carry = _carrying_mul(quo_local2, div_lo)
+    tmp_hi, overflow = _carrying_mul_add(quo_local2, div_hi, carry)
+    tmp = tmp_lo.to_u128! | (tmp_hi.to_u128! << 64)
+    if (overflow != 0) || (duo < tmp)
+      # In `duo &+ div &- tmp`, both the subtraction and addition can overflow, but the result is always a correct positive number.
+      return {(quo_local2 - 1).to_u128!, duo &+ div &- tmp}
+    else
+      return {quo_local2.to_u128!, duo - tmp}
+    end
+  end
+
+  # Undersubtracting long division algorithm.
+
+  quo : UInt128 = 0
+  div_extra = 96 - div_lz                  # Number of lesser significant bits that aren't part of div_sig_32
+  div_sig_32 = (div >> div_extra).to_u32!  # Most significant 32 bits of div
+  div_sig_32_add1 = div_sig_32.to_u64! + 1 # Widened to UInt64 first because adding 1 to a UInt32 could overflow
+
+  loop do
+    duo_extra = 64 - duo_lz                # Number of lesser significant bits that aren't part of duo_sig_n
+    duo_sig_n = (duo >> duo_extra).to_u64! # Most significant 64 bits of duo
+
+    # The two possibility algorithm requires that the difference between most significant bits is less than 32
+    if div_extra <= duo_extra
+      # Undersubtracting long division step
+      quo_part = (duo_sig_n // div_sig_32_add1).to_u128!
+      extra_shl = duo_extra - div_extra
+
+      # Addition to the quotient
+      quo += (quo_part << extra_shl)
+
+      # Subtraction from duo. At least 31 bits are cleared from duo here
+      duo -= ((div &* quo_part) << extra_shl)
+    else
+      # Two possibility algorithm
+
+      shift = 64 - duo_lz
+      duo_sig_n = (duo >> shift).to_u64!
+      div_sig_n = (div >> shift).to_u64!
+      quo_part = duo_sig_n // div_sig_n
+      div_lo = div.to_u64!
+      div_hi = (div >> 64).to_u64!
+
+      tmp_lo, carry = _carrying_mul(quo_part, div_lo)
+      # The undersubtracting long division algorithm has already run once, so overflow beyond 128 bits is impossible
+      tmp_hi, _ = _carrying_mul_add(quo_part, div_hi, carry)
+      tmp = tmp_lo.to_u128! | (tmp_hi.to_u128! << 64)
+
+      if duo < tmp
+        return {quo + (quo_part - 1), duo &+ div &- tmp}
+      else
+        return {quo + quo_part, duo - tmp}
+      end
+    end
+
+    duo_lz = duo.leading_zeros_count
+
+    if div_lz <= duo_lz
+      # Quotient can have 0 or 1 added to it
+      if div <= duo
+        return {quo + 1, duo - div}
+      else
+        return {quo, duo}
+      end
+    end
+
+    # This can only happen if div_sd < 64 (`div_sd` is a name from the Rust source; presumably div's significant-bit count - TODO confirm)
+    if 64 <= duo_lz
+      quo_local3 = duo.to_u64! // div.to_u64!
+      rem_local2 = duo.to_u64! % div.to_u64!
+      return {quo + quo_local3, rem_local2.to_u128!}
+    end
+  end
+end
diff --git a/src/crystal/compiler_rt/mul.cr b/src/crystal/compiler_rt/mul.cr
new file mode 100644
index 000000000000..1d4604225fde
--- /dev/null
+++ b/src/crystal/compiler_rt/mul.cr
@@ -0,0 +1,42 @@
+# :nodoc:
+private macro __mul_impl(name, type, n)
+  # Ported from https://github.com/llvm/llvm-project/blob/ce59ccd04023cab3a837da14079ca2dcbfebb70c/compiler-rt/lib/builtins/int_mulo_impl.inc
+  # :nodoc:
+  fun {{name}}(a : {{type}}, b : {{type}}, overflow : Int32*) : {{type}}
+    # Returns the wrapping product of `a` and `b`; `overflow` is set to 1 when
+    # the exact product does not fit in the operand type, and to 0 otherwise.
+    product = a &* b
+    overflow.value = 0
+    # MIN has no positive counterpart, so it is handled before taking absolute
+    # values: only a multiplier of 0 or 1 leaves the product representable.
+    if a == {{type}}::MIN
+      overflow.value = 1 unless b == 0 || b == 1
+      return product
+    end
+    if b == {{type}}::MIN
+      overflow.value = 1 unless a == 0 || a == 1
+      return product
+    end
+    sign_a = a >> {{n - 1}} # -1 when a is negative, 0 otherwise
+    sign_b = b >> {{n - 1}} # -1 when b is negative, 0 otherwise
+    mag_a = (a ^ sign_a) &- sign_a
+    mag_b = (b ^ sign_b) &- sign_b
+    # Multiplying by -1, 0 or 1 cannot overflow once MIN has been ruled out.
+    return product if mag_a < 2 || mag_b < 2
+
+    # Same signs: the product is positive and bounded by MAX.
+    # Different signs: the product is negative and bounded by MIN.
+    limit =
+      if sign_a == sign_b
+        {{type}}::MAX // mag_b
+      else
+        {{type}}::MIN // ({{type}}.new(0) &- mag_b)
+      end
+    overflow.value = 1 if mag_a > limit
+    product
+  end
+end
+
+__mul_impl(__mulosi4, Int32, 32) # defines __mulosi4 for Int32 operands
+__mul_impl(__mulodi4, Int64, 64) # defines __mulodi4 for Int64 operands
+__mul_impl(__muloti4, Int128, 128) # defines __muloti4 for Int128 operands
diff --git a/src/crystal/compiler_rt/mulodi4.cr b/src/crystal/compiler_rt/mulodi4.cr
deleted file mode 100644
index e853b89b13aa..000000000000
--- a/src/crystal/compiler_rt/mulodi4.cr
+++ /dev/null
@@ -1,37 +0,0 @@
-# :nodoc:
-fun __mulodi4(a : Int64, b : Int64, overflow : Int32*) : Int64
-  n = 64
-  min = Int64::MIN
-  max = Int64::MAX
-  overflow.value = 0
-  result = a &* b
-  if a == min
-    if b != 0 && b != 1
-      overflow.value = 1
-    end
-    return result
-  end
-  if b == min
-    if a != 0 && a != 1
-      overflow.value = 1
-    end
-    return result
-  end
-  sa = a >> (n &- 1)
-  abs_a = (a ^ sa) &- sa
-  sb = b >> (n &- 1)
-  abs_b = (b ^ sb) &- sb
-  if abs_a < 2 || abs_b < 2
-    return result
-  end
-  if sa == sb
-    if abs_a > max // abs_b
-      overflow.value = 1
-    end
-  else
-    if abs_a > min // (0i64 &- abs_b)
-      overflow.value = 1
-    end
-  end
-  return result
-end