From 154f08c575917a0eadc194f2472e8c8a815b82e6 Mon Sep 17 00:00:00 2001
From: Kristoffer Carlsson <kcarlsson89@gmail.com>
Date: Thu, 13 Feb 2020 16:30:31 +0100
Subject: [PATCH] add functions for doing saturated adds and subs

---
 src/LLVM_intrinsics.jl |  31 ++++++++----
 src/simdvec.jl         | 111 +++++++++++++++++++++--------------------
 test/runtests.jl       |  33 ++++++++++++
 3 files changed, 112 insertions(+), 63 deletions(-)

diff --git a/src/LLVM_intrinsics.jl b/src/LLVM_intrinsics.jl
index 2807968..8c3f172 100644
--- a/src/LLVM_intrinsics.jl
+++ b/src/LLVM_intrinsics.jl
@@ -45,9 +45,10 @@ suffix(N::Integer, ::Type{Ptr{T}}) where {T} = "v$(N)p0$(T<:IntegerTypes ? "i" :
 suffix(N::Integer, ::Type{T}) where {T}      = "v$(N)$(T<:IntegerTypes   ? "i" : "f")$(8*sizeof(T))"
 suffix(::Type{T}) where {T}                  = "$(T<:IntegerTypes        ? "i" : "f")$(8*sizeof(T))"
 
-llvm_name(llvmf, N, T)                           = string("llvm", ".", llvmf, ".", suffix(N, T))
-llvm_name(llvmf, ::Type{LVec{N, T}}) where {N,T} = string("llvm", ".", llvmf, ".", suffix(N, T))
-llvm_name(llvmf, ::Type{T}) where {T}            = string("llvm", ".", llvmf, ".", suffix(T))
+dotit(f) = replace(string(f), "_" => ".")  # every "_" in the given name becomes ".", e.g. :sadd_sat -> "sadd.sat"
+llvm_name(llvmf, N, T)                           = string("llvm", ".", dotit(llvmf), ".", suffix(N, T))  # "llvm.<dotted name>.<suffix>"
+llvm_name(llvmf, ::Type{LVec{N, T}}) where {N,T} = string("llvm", ".", dotit(llvmf), ".", suffix(N, T))
+llvm_name(llvmf, ::Type{T}) where {T}            = string("llvm", ".", dotit(llvmf), ".", suffix(T))
 
 llvm_type(::Type{T}) where {T}            = d[T]
 llvm_type(::Type{LVec{N, T}}) where {N,T} = "< $N x $(d[T])>"
@@ -171,13 +172,23 @@ const BINARY_INTRINSICS_FLOAT = [
     :round
 ]
 
-for f in BINARY_INTRINSICS_FLOAT
-    @eval @generated function $(f)(x::T, y::T) where T<:LT{<:FloatingTypes}
-        ff = llvm_name($(QuoteNode(f)), T,)
-        return :(
-            $(Expr(:meta, :inline));
-            ccall($ff, llvmcall, T, (T, T), x, y)
-        )
+const BINARY_INTRINSICS_INT = [  # two-argument integer intrinsics; llvm_name dots the underscore, e.g. "llvm.sadd.sat.<suffix>"
+    :sadd_sat  # backs add_saturate for IntTypes in BINARY_OPS
+    :uadd_sat  # backs add_saturate for UIntTypes
+    :ssub_sat  # backs sub_saturate for IntTypes
+    :usub_sat  # backs sub_saturate for UIntTypes
+]
+
+for (fs, c) in zip([BINARY_INTRINSICS_FLOAT, BINARY_INTRINSICS_INT],  # each intrinsic list ...
+                   [FloatingTypes,           IntegerTypes])           # ... paired with the element types it is defined for
+    for f in fs
+        @eval @generated function $(f)(x::T, y::T) where T<:LT{<:$c}  # one two-argument method per intrinsic name
+            ff = llvm_name($(QuoteNode(f)), T,)  # intrinsic name string, computed from the concrete T in the generator
+            return :(
+                $(Expr(:meta, :inline));
+                ccall($ff, llvmcall, T, (T, T), x, y)
+            )
+        end
     end
 end
 
diff --git a/src/simdvec.jl b/src/simdvec.jl
index a09ca7f..6932b59 100644
--- a/src/simdvec.jl
+++ b/src/simdvec.jl
@@ -177,50 +177,54 @@ end
 ####################
 
 const BINARY_OPS = [
-    (:+        , IntegerTypes  , Intrinsics.add)
-    (:-        , IntegerTypes  , Intrinsics.sub)
-    (:*        , IntegerTypes  , Intrinsics.mul)
-    (:div      , UIntTypes     , Intrinsics.udiv)
-    (:div      , IntTypes      , Intrinsics.sdiv)
-    (:rem      , UIntTypes     , Intrinsics.urem)
-    (:rem      , IntTypes      , Intrinsics.srem)
-
-    (:+        , FloatingTypes , Intrinsics.fadd)
-    (:-        , FloatingTypes , Intrinsics.fsub)
-    (:*        , FloatingTypes , Intrinsics.fmul)
-    (:^        , FloatingTypes , Intrinsics.pow)
-    (:/        , FloatingTypes , Intrinsics.fdiv)
-    (:rem      , FloatingTypes , Intrinsics.frem)
-    (:min      , FloatingTypes , Intrinsics.minnum)
-    (:max      , FloatingTypes , Intrinsics.maxnum)
-    (:copysign , FloatingTypes , Intrinsics.copysign)
-
-    (:~        , BIntegerTypes  , Intrinsics.xor)
-    (:&        , BIntegerTypes  , Intrinsics.and)
-    (:|        , BIntegerTypes  , Intrinsics.or)
-    (:⊻        , BIntegerTypes  , Intrinsics.xor)
-
-    (:(==)     , BIntegerTypes  , Intrinsics.icmp_eq)
-    (:(!=)     , BIntegerTypes  , Intrinsics.icmp_ne)
-    (:(>)      , BIntTypes      , Intrinsics.icmp_sgt)
-    (:(>=)     , BIntTypes      , Intrinsics.icmp_sge)
-    (:(<)      , BIntTypes      , Intrinsics.icmp_slt)
-    (:(<=)     , BIntTypes      , Intrinsics.icmp_sle)
-    (:(>)      , UIntTypes      , Intrinsics.icmp_ugt)
-    (:(>=)     , UIntTypes      , Intrinsics.icmp_uge)
-    (:(<)      , UIntTypes      , Intrinsics.icmp_ult)
-    (:(<=)     , UIntTypes      , Intrinsics.icmp_ule)
-
-    (:(==)     , FloatingTypes , Intrinsics.fcmp_oeq)
-    (:(!=)     , FloatingTypes , Intrinsics.fcmp_une)
-    (:(>)      , FloatingTypes , Intrinsics.fcmp_ogt)
-    (:(>=)     , FloatingTypes , Intrinsics.fcmp_oge)
-    (:(<)      , FloatingTypes , Intrinsics.fcmp_olt)
-    (:(<=)     , FloatingTypes , Intrinsics.fcmp_ole)
+    (:(Base.:+)        , IntegerTypes  , Intrinsics.add)
+    (:(Base.:-)        , IntegerTypes  , Intrinsics.sub)
+    (:(Base.:*)        , IntegerTypes  , Intrinsics.mul)
+    (:(Base.div)       , UIntTypes     , Intrinsics.udiv)
+    (:(Base.div)       , IntTypes      , Intrinsics.sdiv)
+    (:(Base.rem)       , UIntTypes     , Intrinsics.urem)
+    (:(Base.rem)       , IntTypes      , Intrinsics.srem)
+
+    (:(add_saturate) , IntTypes  , Intrinsics.sadd_sat)  # not Base-qualified: defined as a function of the enclosing module
+    (:(add_saturate) , UIntTypes , Intrinsics.uadd_sat)  # same name, unsigned element types pick the unsigned intrinsic
+    (:(sub_saturate) , IntTypes  , Intrinsics.ssub_sat)
+    (:(sub_saturate) , UIntTypes , Intrinsics.usub_sat)
+
+    (:(Base.:+)        , FloatingTypes , Intrinsics.fadd)
+    (:(Base.:-)        , FloatingTypes , Intrinsics.fsub)
+    (:(Base.:*)        , FloatingTypes , Intrinsics.fmul)
+    (:(Base.:^)        , FloatingTypes , Intrinsics.pow)
+    (:(Base.:/)        , FloatingTypes , Intrinsics.fdiv)
+    (:(Base.rem)       , FloatingTypes , Intrinsics.frem)
+    (:(Base.min)       , FloatingTypes , Intrinsics.minnum)
+    (:(Base.max)       , FloatingTypes , Intrinsics.maxnum)
+    (:(Base.copysign)  , FloatingTypes , Intrinsics.copysign)
+    (:(Base.:~)        , BIntegerTypes , Intrinsics.xor)  # NOTE(review): two-argument `~` is wired to xor, same as ⊻ below (unchanged by this patch) - confirm intended
+    (:(Base.:&)        , BIntegerTypes , Intrinsics.and)
+    (:(Base.:|)        , BIntegerTypes , Intrinsics.or)
+    (:(Base.:⊻)        , BIntegerTypes , Intrinsics.xor)
+
+    (:(Base.:(==))   , BIntegerTypes  , Intrinsics.icmp_eq)
+    (:(Base.:!=)     , BIntegerTypes  , Intrinsics.icmp_ne)
+    (:(Base.:>)      , BIntTypes      , Intrinsics.icmp_sgt)
+    (:(Base.:>=)     , BIntTypes      , Intrinsics.icmp_sge)
+    (:(Base.:<)      , BIntTypes      , Intrinsics.icmp_slt)
+    (:(Base.:<=)     , BIntTypes      , Intrinsics.icmp_sle)
+    (:(Base.:>)      , UIntTypes      , Intrinsics.icmp_ugt)
+    (:(Base.:>=)     , UIntTypes      , Intrinsics.icmp_uge)
+    (:(Base.:<)      , UIntTypes      , Intrinsics.icmp_ult)
+    (:(Base.:<=)     , UIntTypes      , Intrinsics.icmp_ule)
+
+    (:(Base.:(==))   , FloatingTypes , Intrinsics.fcmp_oeq)
+    (:(Base.:!=)     , FloatingTypes , Intrinsics.fcmp_une)
+    (:(Base.:>)      , FloatingTypes , Intrinsics.fcmp_ogt)
+    (:(Base.:>=)     , FloatingTypes , Intrinsics.fcmp_oge)
+    (:(Base.:<)      , FloatingTypes , Intrinsics.fcmp_olt)
+    (:(Base.:<=)     , FloatingTypes , Intrinsics.fcmp_ole)
 ]
 
 for (op, constraint, llvmop) in BINARY_OPS
-    @eval @inline function (Base.$op)(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint}
+    @eval @inline function $op(x::Vec{N, T}, y::Vec{N, T}) where {N, T <: $constraint}  # op is spliced as written in the table: Base-qualified entries extend Base, bare names (add_saturate) are local
         Vec($(llvmop)(x.data, y.data))
     end
 end
@@ -317,22 +321,23 @@ for v in (:<<, :>>, :>>>)
     end
 end
 
+
 # Vectorize binary functions
 for (op, constraint) in [BINARY_OPS;
-        (:flipsign , ScalarTypes)
-        (:copysign , ScalarTypes)
-        (:signbit  , ScalarTypes)
-        (:min      , IntegerTypes)
-        (:max      , IntegerTypes)
-        (:<<       , IntegerTypes)
-        (:>>       , IntegerTypes)
-        (:>>>      , IntegerTypes)
+        (:(Base.flipsign) , ScalarTypes)  # extra (op, constraint) pairs appended to BINARY_OPS for this loop only
+        (:(Base.copysign) , ScalarTypes)
+        (:(Base.signbit)  , ScalarTypes)
+        (:(Base.min)      , IntegerTypes)
+        (:(Base.max)      , IntegerTypes)
+        (:(Base.:<<)      , IntegerTypes)
+        (:(Base.:>>)      , IntegerTypes)
+        (:(Base.:>>>)     , IntegerTypes)
     ]
-    @eval @inline function (Base.$op)(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint}
-        Base.$op(Vec{N, T}(x), y)
+    @eval @inline function $op(x::T2, y::Vec{N, T}) where {N, T2<:ScalarTypes, T <: $constraint}  # scalar on the left
+        $op(Vec{N, T}(x), y)  # pass the scalar through Vec{N, T}(x) and defer to the Vec-Vec method
     end
-    @eval @inline function (Base.$op)(x::Vec{N, T}, y::T2) where {N, T2 <:ScalarTypes, T <: $constraint}
-        Base.$op(x, Vec{N, T}(y))
+    @eval @inline function $op(x::Vec{N, T}, y::T2) where {N, T2 <:ScalarTypes, T <: $constraint}  # scalar on the right
+        $op(x, Vec{N, T}(y))  # pass the scalar through Vec{N, T}(y) and defer to the Vec-Vec method
     end
 end
 
diff --git a/test/runtests.jl b/test/runtests.jl
index 74bb8f3..7d3c53c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -130,6 +130,16 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
         @test Tuple(V8I32(v8i32)^3) === v8i32.^3
     end
 
+    @testset "saturation" begin
+        v = Vec{4, UInt8}(UInt8.((150, 250, 125, 0)))
+        @test SIMD.add_saturate(v, UInt8(50)) === Vec{4, UInt8}(UInt8.((200, 255, 175, 50)))  # lane 2: 250 + 50 clamps to 255
+        @test SIMD.sub_saturate(v, UInt8(100)) === Vec{4, UInt8}(UInt8.((50, 150, 25, 0)))  # lane 4: 0 - 100 clamps to 0
+        v = Vec{4, Int8}(Int8.((100, -100, 20, -20)))
+        @test SIMD.add_saturate(v, Int8(50)) === Vec{4, Int8}(Int8.((127, -50, 70, 30)))  # lane 1: 100 + 50 clamps to 127
+        @test SIMD.sub_saturate(v, Int8(50)) === Vec{4, Int8}(Int8.((50, -128, -30, -70)))  # lane 2: -100 - 50 clamps to -128
+
+    end
+
     @testset "Floating point arithmetic functions" begin
 
         global const v4f64b = map(x->Float64(x+1), v4f64)
@@ -632,6 +642,29 @@ llvm_ir(f, args) = sprint(code_llvm, f, Base.typesof(args...))
             @test occursin(" fadd <4 x double>", ir)
             # @test occursin(r"( shufflevector <4 x double>.*){2}"s, ir)
         end
+
+        function isascii_simd(s::String)  # true iff no byte of `s` is >= 0x80
+            len = sizeof(s)
+            nwords = len >> 7  # number of complete 128-byte chunks (4 loads of 32 bytes each)
+            _0x80 = Vec{32, UInt8}(0x80)
+            p = pointer(s)
+            i = 0
+            GC.@preserve s for _ in 1:nwords
+                comp = Vec{32, UInt8}(0x00)
+                for _ in 1:4
+                    v = SIMD.vload(LVec{32, UInt8}, p + i)  # NOTE(review): loads via LVec while the mask is a Vec - confirm vload accepts this type
+                    comp_i = v & _0x80
+                    comp |= comp_i  # OR, not +: 0x80 + 0x80 wraps to 0x00 in UInt8 and would hide non-ASCII bytes
+                    i += 32
+                end
+                reduce(|, comp) == 0x00 || return false
+            end
+            # Check the trailing bytes that do not fill a whole 128-byte chunk
+            for i = nwords*32*4+1:len
+                @inbounds(codeunit(s, i)) >= 0x80 && return false
+            end
+            return true
+        end  # NOTE(review): defined but not called in the visible part of this testset - add an @test or confirm intent
     end
 
     @testset "Vector shuffles" begin