AVX512 GEMM kernel #14

Merged: 4 commits, Jan 16, 2019
2 changes: 2 additions & 0 deletions benchmarks/blas.nim
@@ -1,6 +1,8 @@
when defined(osx):
  const blas = "libopenblas.dylib"
  {.passC: "-I'/usr/local/opt/openblas/include' -L'/usr/local/opt/openblas/lib'".}
elif defined(linux):
  const blas = "libcblas.so"
else:
  {.fatal: "OpenBLAS not configured for this platform".}
  # When adding a new platform, you also need to update nim.cfg
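For context, the blas constant above is the shared-library name handed to Nim's dynlib pragma when the benchmark imports the CBLAS routines it calls. A minimal sketch of that pattern; the wrapper below is illustrative only, not the benchmark's actual declaration:

const
  CblasRowMajor = 101.cint
  CblasNoTrans  = 111.cint

# Hypothetical CBLAS import resolved at runtime through the `blas` library name above.
proc cblas_sgemm(order, transA, transB: cint,
                 M, N, K: cint, alpha: cfloat,
                 A: ptr cfloat, lda: cint,
                 B: ptr cfloat, ldb: cint,
                 beta: cfloat,
                 C: ptr cfloat, ldc: cint)
  {.dynlib: blas, importc: "cblas_sgemm", cdecl.}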
114 changes: 108 additions & 6 deletions benchmarks/gemm/gemm_bench_float32.nim
@@ -50,13 +50,13 @@ import
../../laser/primitives/matrix_multiplication/gemm

const
  M = 16*6*20
  K = 16*6*20
  N = 16*6*20
  M = 32*6*20
  K = 32*6*20
  N = 32*6*20
  NbSamples = 10    # This might stress the allocator when packing if the matrices are big
  CpuGhz = 2.7      # Assuming no turbo
  NumCpuCores = 2
  CpuFlopCycle = 32 # AVX2: 2xFMA/cycle = 2x8x2 - 2 x 8 floats x (1 add + 1 mul)
  CpuGhz = 3.6      # i9-9980XE OC, all-core turbo 4.1GHz (AVX2 4.0GHz, AVX512 3.6GHz)
  NumCpuCores = 18
  CpuFlopCycle = 64 # AVX512: 2xFMA/cycle = 2x16x2 - 2 x 16 floats x (1 add + 1 mul)

const
  ashape: MatrixShape = (M, K)
@@ -244,6 +244,56 @@ when isMainModule:
###############################
# OpenMP

# i9_9980XE Skylake-X 18 cores overclocked 4.1 GHz all-turbo, 4.0 GHz AVX turbo, 3.6 GHz AVX512 turbo
# PyTorch Glow compiled with AVX2, as AVX512 is slower
# Warmup: 0.9018 s, result 224 (displayed to avoid compiler optimizing warmup away)

# A matrix shape: (M: 3840, N: 3840)
# B matrix shape: (M: 3840, N: 3840)
# Output shape: (M: 3840, N: 3840)
# Required number of operations: 113246.208 millions
# Required bytes: 117.965 MB
# Arithmetic intensity: 960.000 FLOP/byte
# Theoretical peak single-core: 230.400 GFLOP/s
# Theoretical peak multi: 4147.200 GFLOP/s
# Make sure to not bench Apple Accelerate or the default Linux BLAS.

# OpenBLAS benchmark
# Collected 10 samples in 0.504 seconds
# Average time: 49.841 ms
# Stddev time: 4.290 ms
# Min time: 48.066 ms
# Max time: 61.994 ms
# Perf: 2272.149 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1965942382812

# Laser production implementation
# Collected 10 samples in 0.653 seconds
# Average time: 64.678 ms
# Stddev time: 2.742 ms
# Min time: 63.140 ms
# Max time: 71.649 ms
# Perf: 1750.928 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1968383789062

# PyTorch Glow: libjit matmul implementation
# Collected 10 samples in 16.555 seconds
# Average time: 1655.510 ms
# Stddev time: 0.204 ms
# Min time: 1655.276 ms
# Max time: 1655.983 ms
# Perf: 68.406 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1965942382812

###############################
# i5-5227U 2.7 GHz Broadwell dual core AVX2

# $ ./build/bench_gemm
# Warmup: 1.1900 s, result 224 (displayed to avoid compiler optimizing warmup away)

@@ -281,6 +331,58 @@ when isMainModule:

###############################
# Serial - Nim code compiled without -d:openmp
# i9_9980XE Skylake-X 18 cores overclocked 4.1 GHz all-turbo, 4.0 GHz AVX turbo, 3.6 GHz AVX512 turbo
# PyTorch Glow compiled with AVX2, as AVX512 is slower
# For some reason OPENBLAS_NUM_THREADS=1 is ignored on Linux ...

# $ OPENBLAS_NUM_THREADS=1 ./build/bench_gemm
# Warmup: 0.9034 s, result 224 (displayed to avoid compiler optimizing warmup away)

# A matrix shape: (M: 3840, N: 3840)
# B matrix shape: (M: 3840, N: 3840)
# Output shape: (M: 3840, N: 3840)
# Required number of operations: 113246.208 millions
# Required bytes: 117.965 MB
# Arithmetic intensity: 960.000 FLOP/byte
# Theoretical peak single-core: 230.400 GFLOP/s
# Theoretical peak multi: 4147.200 GFLOP/s
# Make sure to not bench Apple Accelerate or the default Linux BLAS.

# OpenBLAS benchmark
# Collected 10 samples in 0.499 seconds
# Average time: 49.279 ms
# Stddev time: 3.924 ms
# Min time: 47.855 ms
# Max time: 60.436 ms
# Perf: 2298.061 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1965942382812

# Laser production implementation
# Collected 10 samples in 6.828 seconds
# Average time: 682.218 ms
# Stddev time: 9.549 ms
# Min time: 667.896 ms
# Max time: 693.479 ms
# Perf: 165.997 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1968383789062

# PyTorch Glow: libjit matmul implementation
# Collected 10 samples in 17.060 seconds
# Average time: 1705.967 ms
# Stddev time: 0.332 ms
# Min time: 1705.659 ms
# Max time: 1706.847 ms
# Perf: 66.382 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1965942382812

###############################
# i5-5227U 2.7 GHz Broadwell dual core AVX2

# $ OPENBLAS_NUM_THREADS=1 ./build/bench_gemm
# Warmup: 1.1973 s, result 224 (displayed to avoid compiler optimizing warmup away)
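The "Theoretical peak" and "Arithmetic intensity" figures in the logs above follow directly from the benchmark constants. A standalone sketch of that arithmetic, with the numbers copied from the logs:

let
  CpuGhz       = 3.6            # AVX512 all-core turbo
  CpuFlopCycle = 64             # 2 FMA/cycle x 16 float32 x (1 add + 1 mul)
  NumCpuCores  = 18
  M = 3840                      # 32*6*20
  N = 3840
  K = 3840

let requiredOps   = 2.0 * M.float * N.float * K.float     # 113246.208e6 FLOP
let requiredBytes = float((M*K + K*N) * sizeof(float32))  # reading A and B: 117.965 MB
let intensity     = requiredOps / requiredBytes           # 960 FLOP/byte
let peakSingle    = CpuGhz * CpuFlopCycle.float           # 230.4 GFLOP/s
let peakMulti     = peakSingle * NumCpuCores.float        # 4147.2 GFLOP/s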
4 changes: 4 additions & 0 deletions laser/cpuinfo.nim
@@ -282,6 +282,10 @@ proc cpuinfo_get_current_core*(): ptr CPUInfo_core {.cpuinfo_proc.}
# Otherwise "curSrcFolder" is ignored
{.passC: "-I" & cpuinfoPath & "src -I" & curSrcFolder & DirSep & "third_party".}

when defined(linux):
  {.passC: "-D_GNU_SOURCE".}
  {.passL: "-lpthread".}

template compile(path: static string): untyped =
  # Path: the path from cpuinfo/src folder
  const compiled_object = block:
6 changes: 4 additions & 2 deletions laser/primitives/matrix_multiplication/gemm.nim
@@ -198,11 +198,13 @@ proc gemm_strided*[T: SomeNumber](

  when defined(i386) or defined(amd64):
    when T is float32:
      if cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA)
      if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512)
      elif cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA)
      elif cpuinfo_has_x86_avx(): dispatch(x86_AVX)
      elif cpuinfo_has_x86_sse(): dispatch(x86_SSE)
    elif T is float64:
      if cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA)
      if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512)
      elif cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA)
      elif cpuinfo_has_x86_avx(): dispatch(x86_AVX)
      elif cpuinfo_has_x86_sse2(): dispatch(x86_SSE2)
    elif T is int32 or T is uint32:
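With this change gemm_strided selects the AVX512 micro-kernel at runtime whenever cpuinfo_has_x86_avx512f() reports support, before falling back to FMA, AVX and SSE. For reference, a minimal call sketch for a row-major float32 multiply; the parameter layout (sizes, then alpha, A and its element strides, B and its strides, beta, C and its strides) is assumed from the benchmark code rather than shown in this hunk:

import laser/primitives/matrix_multiplication/gemm

const
  M = 4
  N = 4
  K = 4

var a = newSeq[float32](M*K)   # row-major A
var b = newSeq[float32](K*N)   # row-major B
var c = newSeq[float32](M*N)   # row-major C (output)

# C = 1.0 * A * B + 0.0 * C; strides are (row, col) element strides.
gemm_strided(
  M, N, K,
  1'f32, a[0].unsafeAddr, K, 1,
         b[0].unsafeAddr, N, 1,
  0'f32, c[0].addr,       N, 1)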
19 changes: 10 additions & 9 deletions laser/primitives/matrix_multiplication/gemm_tiling.nim
@@ -77,22 +77,23 @@ type
    x86_SSE4_1,
    x86_AVX,
    x86_AVX_FMA,
    x86_AVX2
    # x86_AVX512 # TODO
    x86_AVX2,
    x86_AVX512
    # Note that Skylake SP, Xeon Bronze Silver and Gold 5XXX
    # only have a single AVX512 port and AVX2 can be faster
    # due to AVX512 downclocking

  X86_FeatureMap = array[CPUFeatureX86, int]

const X86_vecsize_float: X86_FeatureMap = [
  x86_Generic: 1,
  x86_SSE: 128 div 8,
  x86_SSE2: 128 div 8,
  x86_SSE4_1: 128 div 8,
  x86_AVX: 256 div 8,
  x86_AVX_FMA: 256 div 8,
  x86_AVX2: 256 div 8
  # x86_AVX512: 512 div 8
  x86_AVX2: 256 div 8,
  x86_AVX512: 512 div 8
]

const X86_vecsize_int: X86_FeatureMap = [
Expand All @@ -103,7 +104,7 @@ const X86_vecsize_int: X86_FeatureMap = [
x86_AVX: 128 div 8, # Not even addition with integer AVX
x86_AVX_FMA: 128 div 8,
x86_AVX2: 256 div 8,
# x86_AVX512: 512 div 8
x86_AVX512: 512 div 8
]

# mr * nr < number of registers - 4
@@ -115,8 +116,8 @@ const X86_regs: X86_FeatureMap = [
  x86_SSE4_1: 6,
  x86_AVX: 6, # 16 YMM registers
  x86_AVX_FMA: 6,
  x86_AVX2: 6
  # x86_AVX512: 6 # 32 ZMM registers
  x86_AVX2: 6,
  x86_AVX512: 14 # 32 ZMM registers
]
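The X86_regs values are sized against the budget in the comment above: the micro-kernel keeps mr rows of accumulators per SIMD vector of B, so mr times the number of B vectors, plus a few registers for A broadcasts and B loads, must fit in the register file. A quick check of that arithmetic, assuming two B vectors per iteration (NbVecs = 2, as in the existing AVX kernels; that constant is not shown in this hunk):

const NbVecs = 2
doAssert  6 * NbVecs <= 16 - 4   # AVX2:   16 YMM registers, mr = 6  -> 12 accumulators
doAssert 14 * NbVecs <= 32 - 4   # AVX512: 32 ZMM registers, mr = 14 -> 28 accumulators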

func x86_ukernel*(cpu: CPUFeatureX86, T: typedesc, c_unit_stride: bool): MicroKernel =
74 changes: 74 additions & 0 deletions laser/primitives/matrix_multiplication/gemm_ukernel_avx512.nim
@@ -0,0 +1,74 @@
# Laser
# Copyright (c) 2018 Mamy André-Ratsimbazafy
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
# This file may not be copied, modified, or distributed except according to those terms.

import
  ./gemm_ukernel_generator, ./gemm_tiling,
  ../../simd

ukernel_generator(
  x86_AVX512,
  typ = float32,
  vectype = m512,
  nb_scalars = 16,
  simd_setZero = mm512_setzero_ps,
  simd_broadcast_value = mm512_set1_ps,
  simd_load_aligned = mm512_load_ps,
  simd_load_unaligned = mm512_loadu_ps,
  simd_store_unaligned = mm512_storeu_ps,
  simd_mul = mm512_mul_ps,
  simd_add = mm512_add_ps,
  simd_fma = mm512_fmadd_ps
)

ukernel_generator(
  x86_AVX512,
  typ = float64,
  vectype = m512d,
  nb_scalars = 8,
  simd_setZero = mm512_setzero_pd,
  simd_broadcast_value = mm512_set1_pd,
  simd_load_aligned = mm512_load_pd,
  simd_load_unaligned = mm512_loadu_pd,
  simd_store_unaligned = mm512_storeu_pd,
  simd_mul = mm512_mul_pd,
  simd_add = mm512_add_pd,
  simd_fma = mm512_fmadd_pd
)

template int32x16_muladd_unfused_avx512(a, b, c: m512i): m512i =
  mm512_add_epi32(mm512_mullo_epi32(a, b), c)

ukernel_generator(
  x86_AVX512,
  typ = int32,
  vectype = m512i,
  nb_scalars = 16,
  simd_setZero = mm512_setzero_si512,
  simd_broadcast_value = mm512_set1_epi32,
  simd_load_aligned = mm512_load_si512,
  simd_load_unaligned = mm512_loadu_si512,
  simd_store_unaligned = mm512_storeu_si512,
  simd_mul = mm512_mullo_epi32,
  simd_add = mm512_add_epi32,
  simd_fma = int32x16_muladd_unfused_avx512
)

template int64x8_muladd_unfused_avx512(a, b, c: m512i): m512i =
  mm512_add_epi64(mm512_mullo_epi64(a, b), c)

ukernel_generator(
  x86_AVX512,
  typ = int64,
  vectype = m512i,
  nb_scalars = 8,
  simd_setZero = mm512_setzero_si512,
  simd_broadcast_value = mm512_set1_epi64,
  simd_load_aligned = mm512_load_si512,
  simd_load_unaligned = mm512_loadu_si512,
  simd_store_unaligned = mm512_storeu_si512,
  simd_mul = mm512_mullo_epi64,
  simd_add = mm512_add_epi64,
  simd_fma = int64x8_muladd_unfused_avx512
)
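AVX512 has no fused multiply-add for 32- or 64-bit integer lanes, so the two templates above synthesize one from a lane-wise multiply and add. Per lane the generated simd_fma computes the equivalent of the scalar sketch below (the SIMD multiply wraps on overflow, as mm512_mullo_* does):

# Scalar reference for the unfused integer "FMA" templates above:
# each lane of the result is a*b + c.
func muladdRef[T: int32 | int64](a, b, c: T): T {.inline.} =
  a * b + c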
@@ -13,7 +13,8 @@ import
  ./gemm_ukernel_sse4_1,
  ./gemm_ukernel_avx,
  ./gemm_ukernel_avx_fma,
  ./gemm_ukernel_avx2
  ./gemm_ukernel_avx2,
  ./gemm_ukernel_avx512

{.experimental: "dynamicBindSym".}
