[interpreter] Implement SIMD extended multiply instructions (WebAssem…

…bly#438) These were accepted into the proposal in WebAssembly#376. There are 12 instructions in total: - i16x8.extmul_{low,high}_i8x16_{s,u} - i32x4.extmul_{low,high}_i16x8_{s,u} - i64x2.extmul_{low,high}_i32x4_{s,u} The implementation is straightforward: widen (using existing operations), then multiply with the wider shape. The binary opcodes are not decided yet; they currently follow the ones used in V8. When those are finalized, we can change them to match. Added a test generation script that reuses some logic in the generator for arithmetic instructions. Since these instructions have different src and dst shapes, I tweaked the base class to allow for having different shapes.
ngzhian · Feb 3, 2021 · 7c37165 · 7c37165
1 parent 98915d5
commit 7c37165
Show file tree

Hide file tree

Showing 15 changed files with 1,435 additions and 18 deletions.
diff --git a/interpreter/binary/decode.ml b/interpreter/binary/decode.ml
@@ -365,7 +365,11 @@ let simd_prefix s =
   | 0x97l -> i16x8_min_u
   | 0x98l -> i16x8_max_s
   | 0x99l -> i16x8_max_u
+  | 0x9al -> i16x8_extmul_low_i8x16_s
   | 0x9bl -> i16x8_avgr_u
+  | 0x9dl -> i16x8_extmul_high_i8x16_s
+  | 0x9el -> i16x8_extmul_low_i8x16_u
+  | 0x9fl -> i16x8_extmul_high_i8x16_u
   | 0xa0l -> i32x4_abs
   | 0xa1l -> i32x4_neg
   | 0xa3l -> i32x4_all_true
@@ -385,6 +389,10 @@ let simd_prefix s =
   | 0xb8l -> i32x4_max_s
   | 0xb9l -> i32x4_max_u
   | 0xbal -> i32x4_dot_i16x8_s
+  | 0xbbl -> i32x4_extmul_low_i16x8_s
+  | 0xbdl -> i32x4_extmul_high_i16x8_s
+  | 0xbel -> i32x4_extmul_low_i16x8_u
+  | 0xbfl -> i32x4_extmul_high_i16x8_u
   | 0xc0l -> i64x2_eq
   | 0xc1l -> i64x2_neg
   | 0xcbl -> i64x2_shl
@@ -393,6 +401,10 @@ let simd_prefix s =
   | 0xcel -> i64x2_add
   | 0xd0l -> i64x2_ne
   | 0xd1l -> i64x2_sub
+  | 0xd2l -> i64x2_extmul_low_i32x4_s
+  | 0xd3l -> i64x2_extmul_high_i32x4_s
+  | 0xd6l -> i64x2_extmul_low_i32x4_u
+  | 0xd7l -> i64x2_extmul_high_i32x4_u
   | 0xd5l -> i64x2_mul
   | 0xd8l -> f32x4_ceil
   | 0xd9l -> f32x4_floor

diff --git a/interpreter/binary/encode.ml b/interpreter/binary/encode.ml
@@ -467,6 +467,10 @@ let encode m =
       | Binary (V128 V128Op.(I16x8 MaxS)) -> simd_op 0x98l
       | Binary (V128 V128Op.(I16x8 MaxU)) -> simd_op 0x99l
       | Binary (V128 V128Op.(I16x8 AvgrU)) -> simd_op 0x9bl
+      | Binary (V128 V128Op.(I16x8 ExtMulLowS)) -> simd_op 0x9al
+      | Binary (V128 V128Op.(I16x8 ExtMulHighS)) -> simd_op 0x9dl
+      | Binary (V128 V128Op.(I16x8 ExtMulLowU)) -> simd_op 0x9el
+      | Binary (V128 V128Op.(I16x8 ExtMulHighU)) -> simd_op 0x9fl
       | Binary (V128 V128Op.(I32x4 Add)) -> simd_op 0xael
       | Binary (V128 V128Op.(I32x4 Sub)) -> simd_op 0xb1l
       | Binary (V128 V128Op.(I32x4 MinS)) -> simd_op 0xb6l
@@ -485,11 +489,19 @@ let encode m =
       | Binary (V128 V128Op.(I32x4 LeU)) -> simd_op 0x3el
       | Binary (V128 V128Op.(I32x4 GeS)) -> simd_op 0x3fl
       | Binary (V128 V128Op.(I32x4 GeU)) -> simd_op 0x40l
+      | Binary (V128 V128Op.(I32x4 ExtMulLowS)) -> simd_op 0xbbl
+      | Binary (V128 V128Op.(I32x4 ExtMulHighS)) -> simd_op 0xbdl
+      | Binary (V128 V128Op.(I32x4 ExtMulLowU)) -> simd_op 0xbel
+      | Binary (V128 V128Op.(I32x4 ExtMulHighU)) -> simd_op 0xbfl
       | Binary (V128 V128Op.(I64x2 Add)) -> simd_op 0xcel
       | Binary (V128 V128Op.(I64x2 Sub)) -> simd_op 0xd1l
       | Binary (V128 V128Op.(I64x2 Mul)) -> simd_op 0xd5l
       | Binary (V128 V128Op.(I64x2 Eq)) -> simd_op 0xc0l
       | Binary (V128 V128Op.(I64x2 Ne)) -> simd_op 0xd0l
+      | Binary (V128 V128Op.(I64x2 ExtMulLowS)) -> simd_op 0xd2l
+      | Binary (V128 V128Op.(I64x2 ExtMulHighS)) -> simd_op 0xd3l
+      | Binary (V128 V128Op.(I64x2 ExtMulLowU)) -> simd_op 0xd6l
+      | Binary (V128 V128Op.(I64x2 ExtMulHighU)) -> simd_op 0xd7l
       | Binary (V128 V128Op.(F32x4 Eq)) -> simd_op 0x41l
       | Binary (V128 V128Op.(F32x4 Ne)) -> simd_op 0x42l
       | Binary (V128 V128Op.(F32x4 Lt)) -> simd_op 0x43l

diff --git a/interpreter/exec/eval_simd.ml b/interpreter/exec/eval_simd.ml
@@ -101,6 +101,10 @@ module SimdOp (SXX : Simd.S) (Value : ValueType with type t = SXX.t) = struct
       | I16x8 MaxS -> SXX.I16x8.max_s
       | I16x8 MaxU -> SXX.I16x8.max_u
       | I16x8 AvgrU -> SXX.I16x8.avgr_u
+      | I16x8 ExtMulLowS -> SXX.I16x8_convert.extmul_low_s
+      | I16x8 ExtMulHighS -> SXX.I16x8_convert.extmul_high_s
+      | I16x8 ExtMulLowU -> SXX.I16x8_convert.extmul_low_u
+      | I16x8 ExtMulHighU -> SXX.I16x8_convert.extmul_high_u
       | I32x4 Add -> SXX.I32x4.add
       | I32x4 Sub -> SXX.I32x4.sub
       | I32x4 MinS -> SXX.I32x4.min_s
@@ -121,9 +125,17 @@ module SimdOp (SXX : Simd.S) (Value : ValueType with type t = SXX.t) = struct
       | I32x4 DotI16x8S -> SXX.I32x4_convert.dot_i16x8_s
       | I64x2 Eq -> SXX.I64x2.eq
       | I64x2 Ne -> SXX.I64x2.ne
+      | I32x4 ExtMulLowS -> SXX.I32x4_convert.extmul_low_s
+      | I32x4 ExtMulHighS -> SXX.I32x4_convert.extmul_high_s
+      | I32x4 ExtMulLowU -> SXX.I32x4_convert.extmul_low_u
+      | I32x4 ExtMulHighU -> SXX.I32x4_convert.extmul_high_u
       | I64x2 Add -> SXX.I64x2.add
       | I64x2 Sub -> SXX.I64x2.sub
       | I64x2 Mul -> SXX.I64x2.mul
+      | I64x2 ExtMulLowS -> SXX.I64x2_convert.extmul_low_s
+      | I64x2 ExtMulHighS -> SXX.I64x2_convert.extmul_high_s
+      | I64x2 ExtMulLowU -> SXX.I64x2_convert.extmul_low_u
+      | I64x2 ExtMulHighU -> SXX.I64x2_convert.extmul_high_u
       | F32x4 Eq -> SXX.F32x4.eq
       | F32x4 Ne -> SXX.F32x4.ne
       | F32x4 Lt -> SXX.F32x4.lt

diff --git a/interpreter/exec/simd.ml b/interpreter/exec/simd.ml
@@ -177,6 +177,10 @@ sig
     val widen_high_s : t -> t
     val widen_low_u : t -> t
     val widen_high_u : t -> t
+    val extmul_low_s : t -> t -> t
+    val extmul_high_s : t -> t -> t
+    val extmul_low_u : t -> t -> t
+    val extmul_high_u : t -> t -> t
   end
   module I32x4_convert : sig
     val trunc_sat_f32x4_s : t -> t
@@ -186,10 +190,20 @@ sig
     val widen_low_u : t -> t
     val widen_high_u : t -> t
     val dot_i16x8_s : t -> t -> t
+    val extmul_low_s : t -> t -> t
+    val extmul_high_s : t -> t -> t
+    val extmul_low_u : t -> t -> t
+    val extmul_high_u : t -> t -> t
   end
   module I64x2_convert : sig
     val widen_low_s : t -> t
+    val widen_high_s : t -> t
     val widen_low_u : t -> t
+    val widen_high_u : t -> t
+    val extmul_low_s : t -> t -> t
+    val extmul_high_s : t -> t -> t
+    val extmul_low_u : t -> t -> t
+    val extmul_high_u : t -> t -> t
   end
   module F32x4_convert : sig
     val convert_i32x4_s : t -> t
@@ -417,6 +431,10 @@ struct
     let widen_low_u = widen Lib.List.take 0xffl
     let widen_high_u = widen Lib.List.drop 0xffl
 
+    let extmul_low_s x y = I16x8.mul (widen_low_s x) (widen_low_s y)
+    let extmul_high_s x y = I16x8.mul (widen_high_s x) (widen_high_s y)
+    let extmul_low_u x y = I16x8.mul (widen_low_u x) (widen_low_u y)
+    let extmul_high_u x y = I16x8.mul (widen_high_u x) (widen_high_u y)
   end
 
   module I32x4_convert = struct
@@ -441,16 +459,28 @@ struct
         | [], [] -> []
         | _, _ -> assert false
       in Rep.of_i32x4 (dot xs ys)
+
+    let extmul_low_s x y = I32x4.mul (widen_low_s x) (widen_low_s y)
+    let extmul_high_s x y = I32x4.mul (widen_high_s x) (widen_high_s y)
+    let extmul_low_u x y = I32x4.mul (widen_low_u x) (widen_low_u y)
+    let extmul_high_u x y = I32x4.mul (widen_high_u x) (widen_high_u y)
   end
 
   module I64x2_convert = struct
-    let widen mask x =
+    let widen take_or_drop mask x =
       Rep.of_i64x2
         (List.map
            (fun i32 -> Int64.(logand mask (of_int32 i32)))
-           (Lib.List.take 2 (Rep.to_i32x4 x)))
-    let widen_low_s = widen 0xffffffffffffffffL
-    let widen_low_u = widen 0xffffffffL
+           (take_or_drop 2 (Rep.to_i32x4 x)))
+    let widen_low_s = widen Lib.List.take 0xffffffffffffffffL
+    let widen_high_s = widen Lib.List.drop 0xffffffffffffffffL
+    let widen_low_u = widen Lib.List.take 0xffffffffL
+    let widen_high_u = widen Lib.List.drop 0xffffffffL
+
+    let extmul_low_s x y = I64x2.mul (widen_low_s x) (widen_low_s y)
+    let extmul_high_s x y = I64x2.mul (widen_high_s x) (widen_high_s y)
+    let extmul_low_u x y = I64x2.mul (widen_low_u x) (widen_low_u y)
+    let extmul_high_u x y = I64x2.mul (widen_high_u x) (widen_high_u y)
   end
 
   module F32x4_convert = struct

diff --git a/interpreter/syntax/ast.ml b/interpreter/syntax/ast.ml
@@ -55,6 +55,7 @@ struct
               | Swizzle | Shuffle of int list | NarrowS | NarrowU
               | AddSatS | AddSatU | SubSatS | SubSatU
               | DotI16x8S
+              | ExtMulLowS | ExtMulHighS | ExtMulLowU | ExtMulHighU
   type funop = Abs | Neg | Sqrt
              | Ceil | Floor | Trunc | Nearest
              | ConvertI32x4S | ConvertI32x4U

diff --git a/interpreter/syntax/operators.ml b/interpreter/syntax/operators.ml
@@ -340,6 +340,10 @@ let i16x8_min_u = Binary (V128 V128Op.(I16x8 MinU))
 let i16x8_max_s = Binary (V128 V128Op.(I16x8 MaxS))
 let i16x8_max_u = Binary (V128 V128Op.(I16x8 MaxU))
 let i16x8_avgr_u = Binary (V128 V128Op.(I16x8 AvgrU))
+let i16x8_extmul_low_i8x16_s = Binary (V128 V128Op.(I16x8 ExtMulLowS))
+let i16x8_extmul_high_i8x16_s = Binary (V128 V128Op.(I16x8 ExtMulHighS))
+let i16x8_extmul_low_i8x16_u = Binary (V128 V128Op.(I16x8 ExtMulLowU))
+let i16x8_extmul_high_i8x16_u = Binary (V128 V128Op.(I16x8 ExtMulHighU))
 
 let i32x4_splat = Convert (V128 V128Op.(I32x4 Splat))
 let i32x4_extract_lane imm = SimdExtract (V128Op.I32x4 (ZX, imm))
@@ -375,6 +379,10 @@ let i32x4_mul = Binary (V128 V128Op.(I32x4 Mul))
 let i32x4_trunc_sat_f32x4_s = Unary (V128 V128Op.(I32x4 TruncSatF32x4S))
 let i32x4_trunc_sat_f32x4_u = Unary (V128 V128Op.(I32x4 TruncSatF32x4U))
 let i32x4_dot_i16x8_s = Binary (V128 V128Op.(I32x4 DotI16x8S))
+let i32x4_extmul_low_i16x8_s = Binary (V128 V128Op.(I32x4 ExtMulLowS))
+let i32x4_extmul_high_i16x8_s = Binary (V128 V128Op.(I32x4 ExtMulHighS))
+let i32x4_extmul_low_i16x8_u = Binary (V128 V128Op.(I32x4 ExtMulLowU))
+let i32x4_extmul_high_i16x8_u = Binary (V128 V128Op.(I32x4 ExtMulHighU))
 
 let i64x2_splat = Convert (V128 V128Op.(I64x2 Splat))
 let i64x2_extract_lane imm = SimdExtract (V128Op.I64x2 (ZX, imm))
@@ -388,6 +396,10 @@ let i64x2_mul = Binary (V128 V128Op.(I64x2 Mul))
 let i64x2_shl = SimdShift V128Op.(I64x2 Shl)
 let i64x2_shr_s = SimdShift V128Op.(I64x2 ShrS)
 let i64x2_shr_u = SimdShift V128Op.(I64x2 ShrU)
+let i64x2_extmul_low_i32x4_s = Binary (V128 V128Op.(I64x2 ExtMulLowS))
+let i64x2_extmul_high_i32x4_s = Binary (V128 V128Op.(I64x2 ExtMulHighS))
+let i64x2_extmul_low_i32x4_u = Binary (V128 V128Op.(I64x2 ExtMulLowU))
+let i64x2_extmul_high_i32x4_u = Binary (V128 V128Op.(I64x2 ExtMulHighU))
 
 let f32x4_splat = Convert (V128 V128Op.(F32x4 Splat))
 let f32x4_extract_lane imm = SimdExtract (V128Op.F32x4 (ZX, imm))

diff --git a/interpreter/text/arrange.ml b/interpreter/text/arrange.ml
@@ -298,6 +298,10 @@ struct
     | I16x8 MaxS -> "i16x8.max_s"
     | I16x8 MaxU -> "i16x8.max_u"
     | I16x8 AvgrU -> "i16x8.avgr_u"
+    | I16x8 ExtMulLowS -> "i16x8.extmul_low_i8x16_s"
+    | I16x8 ExtMulHighS -> "i16x8.extmul_high_i8x16_s"
+    | I16x8 ExtMulLowU -> "i16x8.extmul_low_i8x16_u"
+    | I16x8 ExtMulHighU -> "i16x8.extmul_high_i8x16_u"
     | I32x4 Add -> "i32x4.add"
     | I32x4 Sub -> "i32x4.sub"
     | I32x4 Mul -> "i32x4.mul"
@@ -306,9 +310,17 @@ struct
     | I32x4 MaxS -> "i32x4.max_s"
     | I32x4 MaxU -> "i32x4.max_u"
     | I32x4 DotI16x8S -> "i32x4.dot_i16x8_s"
+    | I32x4 ExtMulLowS -> "i32x4.extmul_low_i16x8_s"
+    | I32x4 ExtMulHighS -> "i32x4.extmul_high_i16x8_s"
+    | I32x4 ExtMulLowU -> "i32x4.extmul_low_i16x8_u"
+    | I32x4 ExtMulHighU -> "i32x4.extmul_high_i16x8_u"
     | I64x2 Add -> "i64x2.add"
     | I64x2 Sub -> "i64x2.sub"
     | I64x2 Mul -> "i64x2.mul"
+    | I64x2 ExtMulLowS -> "i64x2.extmul_low_i32x4_s"
+    | I64x2 ExtMulHighS -> "i64x2.extmul_high_i32x4_s"
+    | I64x2 ExtMulLowU -> "i64x2.extmul_low_i32x4_u"
+    | I64x2 ExtMulHighU -> "i64x2.extmul_high_i32x4_u"
     | F32x4 Eq -> "f32x4.eq"
     | F32x4 Ne -> "f32x4.ne"
     | F32x4 Lt -> "f32x4.lt"

diff --git a/interpreter/text/lexer.mll b/interpreter/text/lexer.mll
@@ -576,6 +576,19 @@ rule token = parse
   | "i32x4.dot_i16x8_s"
   { BINARY i32x4_dot_i16x8_s }
 
+  | "i16x8.extmul_low_i8x16_"(sign as s)
+    { BINARY (ext s i16x8_extmul_low_i8x16_s i16x8_extmul_low_i8x16_u) }
+  | "i16x8.extmul_high_i8x16_"(sign as s)
+    { BINARY (ext s i16x8_extmul_high_i8x16_s i16x8_extmul_high_i8x16_u) }
+  | "i32x4.extmul_low_i16x8_"(sign as s)
+    { BINARY (ext s i32x4_extmul_low_i16x8_s i32x4_extmul_low_i16x8_u) }
+  | "i32x4.extmul_high_i16x8_"(sign as s)
+    { BINARY (ext s i32x4_extmul_high_i16x8_s i32x4_extmul_high_i16x8_u) }
+  | "i64x2.extmul_low_i32x4_"(sign as s)
+    { BINARY (ext s i64x2_extmul_low_i32x4_s i64x2_extmul_low_i32x4_u) }
+  | "i64x2.extmul_high_i32x4_"(sign as s)
+    { BINARY (ext s i64x2_extmul_high_i32x4_s i64x2_extmul_high_i32x4_u) }
+
   | (simd_shape as s) { SIMD_SHAPE (simd_shape s) }
 
   | name as s { VAR s }

diff --git a/test/core/simd/meta/gen_tests.py b/test/core/simd/meta/gen_tests.py
@@ -33,6 +33,7 @@
     'simd_f64x2_pmin_pmax',
     'simd_i32x4_dot_i16x8',
     'simd_load_lane',
+    'simd_ext_mul',
 )
 
 

diff --git a/test/core/simd/meta/simd_arithmetic.py b/test/core/simd/meta/simd_arithmetic.py
@@ -35,14 +35,27 @@ def __str__(self):
     def lane(self):
         return self.LANE_VALUE.get(self.LANE_TYPE)
 
+    @property
+    def dst_lane(self):
+        return self.lane
+
+    @property
+    def src_lane(self):
+        # Lane of the narrower input shape, for ops that widen their lanes
+        # (e.g. extended multiply from i16x8 to i32x4); otherwise same as lane.
+        if hasattr(self, 'SRC_LANE_TYPE'):
+            return self.LANE_VALUE.get(self.SRC_LANE_TYPE)
+        else:
+            return self.lane
+
     @property
     def normal_unary_op_test_data(self):
-        lane = self.lane
+        lane = self.src_lane
         return [0, 1, -1, lane.max - 1, lane.min + 1, lane.min, lane.max, lane.mask]
 
     @property
     def normal_binary_op_test_data(self):
-        lane = self.lane
+        lane = self.src_lane
         return [
             (0, 0),
             (0, 1),
@@ -170,7 +183,7 @@ def get_case_data(self):
             for data_group, v128_forms in self.bin_test_data:
                 for data in data_group:
                     case_data.append([op_name, [str(data[0]), str(data[1])],
-                                      str(o.binary_op(data[0], data[1], self.lane)),
+                                      str(o.binary_op(data[0], data[1], self.src_lane, self.dst_lane)),
                                      v128_forms])
             for data_group in self.full_bin_test_data:
                 for data in data_group.get(op_name):
@@ -183,7 +196,7 @@ def get_case_data(self):
             for data_group, v128_forms in self.unary_test_data:
                 for data in data_group:
                     case_data.append([op_name, [str(data)],
-                                      str(o.unary_op(data, self.lane)),
+                                      str(o.unary_op(data, self.dst_lane)),
                                       v128_forms])
 
         return case_data

diff --git a/test/core/simd/meta/simd_ext_mul.py b/test/core/simd/meta/simd_ext_mul.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+
+""" Base class for generating extended multiply instructions.  These
+instructions take 2 inputs of the same (narrower) lane shape and multiply
+corresponding lanes with extension (no overflow/wraparound), producing 1 output
+of a (wider) shape. These instructions can choose to work on the low or high
+halves of the inputs, and perform signed or unsigned multiply.
+
+Subclasses need to define 3 attributes:
+  - LANE_TYPE (this is the output shape)
+  - SRC_LANE_TYPE (this is the input (narrower) shape)
+  - BINARY_OPS (list of operations)
+"""
+
+from simd_arithmetic import SimdArithmeticCase
+
+
+class SimdExtMulCase(SimdArithmeticCase):
+    UNARY_OPS = ()
+
+    @property
+    def full_bin_test_data(self):
+        return []
+
+    def get_combine_cases(self):
+        return ''
+
+    @property
+    def bin_test_data(self):
+        lane_forms = [self.SRC_LANE_TYPE, self.SRC_LANE_TYPE, self.LANE_TYPE]
+        return [(self.normal_binary_op_test_data, lane_forms)]
+
+    @property
+    def hex_binary_op_test_data(self):
+        return []
+
+    def gen_test_cases(self):
+        wast_filename = '../simd_{wide}_extmul_{narrow}.wast'.format(
+                wide=self.LANE_TYPE, narrow=self.SRC_LANE_TYPE)
+        with open(wast_filename, 'w') as fp:
+            fp.write(self.get_all_cases())
+
+
+class SimdI16x8ExtMulCase(SimdExtMulCase):
+    LANE_TYPE = 'i16x8'
+    SRC_LANE_TYPE = 'i8x16'
+    BINARY_OPS = ('extmul_low_i8x16_s', 'extmul_high_i8x16_s',
+                  'extmul_low_i8x16_u', 'extmul_high_i8x16_u')
+
+
+class SimdI32x4ExtMulCase(SimdExtMulCase):
+    LANE_TYPE = 'i32x4'
+    SRC_LANE_TYPE = 'i16x8'
+    BINARY_OPS = ('extmul_low_i16x8_s', 'extmul_high_i16x8_s',
+                  'extmul_low_i16x8_u', 'extmul_high_i16x8_u')
+
+
+class SimdI64x2ExtMulCase(SimdExtMulCase):
+    LANE_TYPE = 'i64x2'
+    SRC_LANE_TYPE = 'i32x4'
+    BINARY_OPS = ('extmul_low_i32x4_s', 'extmul_high_i32x4_s',
+                  'extmul_low_i32x4_u', 'extmul_high_i32x4_u')
+
+
+def gen_test_cases():
+    simd_i16x8_ext_mul_case = SimdI16x8ExtMulCase()
+    simd_i16x8_ext_mul_case.gen_test_cases()
+    simd_i32x4_ext_mul_case = SimdI32x4ExtMulCase()
+    simd_i32x4_ext_mul_case.gen_test_cases()
+    simd_i64x2_ext_mul_case = SimdI64x2ExtMulCase()
+    simd_i64x2_ext_mul_case.gen_test_cases()
+
+
+if __name__ == '__main__':
+    gen_test_cases()