AVX512 GEMM kernel #14

Merged: 4 commits, Jan 16, 2019
2 changes: 2 additions & 0 deletions benchmarks/blas.nim
@@ -1,6 +1,8 @@
when defined(osx):
  const blas = "libopenblas.dylib"
  {.passC: "-I'/usr/local/opt/openblas/include' -L'/usr/local/opt/openblas/lib'".}
elif defined(linux):
  const blas = "libcblas.so"
else:
  {.fatal: "OpenBLAS not configured for this platform".}
  # When adding a new platform, you also need to update nim.cfg
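For context, the blas constant above is the shared-library name handed to Nim's dynlib pragma when the benchmark imports the CBLAS routines it calls. A minimal sketch of that pattern; the wrapper below is illustrative only, not the benchmark's actual declaration:

const
  CblasRowMajor = 101.cint
  CblasNoTrans  = 111.cint

# Hypothetical CBLAS import resolved at runtime through the `blas` library name above.
proc cblas_sgemm(order, transA, transB: cint,
                 M, N, K: cint, alpha: cfloat,
                 A: ptr cfloat, lda: cint,
                 B: ptr cfloat, ldb: cint,
                 beta: cfloat,
                 C: ptr cfloat, ldc: cint)
  {.dynlib: blas, importc: "cblas_sgemm", cdecl.}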
114 changes: 108 additions & 6 deletions benchmarks/gemm/gemm_bench_float32.nim
@@ -50,13 +50,13 @@ import
../../laser/primitives/matrix_multiplication/gemm

const
  M = 16*6*20
  K = 16*6*20
  N = 16*6*20
  M = 32*6*20
  K = 32*6*20
  N = 32*6*20
  NbSamples = 10    # This might stress the allocator when packing if the matrices are big
  CpuGhz = 2.7      # Assuming no turbo
  NumCpuCores = 2
  CpuFlopCycle = 32 # AVX2: 2xFMA/cycle = 2x8x2 - 2 x 8 floats x (1 add + 1 mul)
  CpuGhz = 3.6      # i9-9980XE OC, all-core turbo 4.1GHz (AVX2 4.0GHz, AVX512 3.6GHz)
  NumCpuCores = 18
  CpuFlopCycle = 64 # AVX512: 2xFMA/cycle = 2x16x2 - 2 x 16 floats x (1 add + 1 mul)

const
  ashape: MatrixShape = (M, K)
@@ -244,6 +244,56 @@ when isMainModule:
###############################
# OpenMP

# i9_9980XE Skylake-X 18 cores overclocked 4.1 GHz all-turbo, 4.0 GHz AVX turbo, 3.6 GHz AVX512 turbo
# PyTorch Glow compiled with AVX2, as AVX512 is slower
# Warmup: 0.9018 s, result 224 (displayed to avoid compiler optimizing warmup away)

# A matrix shape: (M: 3840, N: 3840)
# B matrix shape: (M: 3840, N: 3840)
# Output shape: (M: 3840, N: 3840)
# Required number of operations: 113246.208 millions
# Required bytes: 117.965 MB
# Arithmetic intensity: 960.000 FLOP/byte
# Theoretical peak single-core: 230.400 GFLOP/s
# Theoretical peak multi: 4147.200 GFLOP/s
# Make sure to not bench Apple Accelerate or the default Linux BLAS.

# OpenBLAS benchmark
# Collected 10 samples in 0.504 seconds
# Average time: 49.841 ms
# Stddev time: 4.290 ms
# Min time: 48.066 ms
# Max time: 61.994 ms
# Perf: 2272.149 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1965942382812

# Laser production implementation
# Collected 10 samples in 0.653 seconds
# Average time: 64.678 ms
# Stddev time: 2.742 ms
# Min time: 63.140 ms
# Max time: 71.649 ms
# Perf: 1750.928 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1968383789062

# PyTorch Glow: libjit matmul implementation
# Collected 10 samples in 16.555 seconds
# Average time: 1655.510 ms
# Stddev time: 0.204 ms
# Min time: 1655.276 ms
# Max time: 1655.983 ms
# Perf: 68.406 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1965942382812

###############################
# i5-5227U 2.7 GHz Broadwell dual core AVX2

# $ ./build/bench_gemm
# Warmup: 1.1900 s, result 224 (displayed to avoid compiler optimizing warmup away)

@@ -281,6 +331,58 @@ when isMainModule:

###############################
# Serial - Nim code compiled without -d:openmp
# i9_9980XE Skylake-X 18 cores overclocked 4.1 GHz all-turbo, 4.0 GHz AVX turbo, 3.6 GHz AVX512 turbo
# PyTorch Glow compiled with AVX2, as AVX512 is slower
# For some reason OPENBLAS_NUM_THREADS=1 is ignored on Linux ...

# $ OPENBLAS_NUM_THREADS=1 ./build/bench_gemm
# Warmup: 0.9034 s, result 224 (displayed to avoid compiler optimizing warmup away)

# A matrix shape: (M: 3840, N: 3840)
# B matrix shape: (M: 3840, N: 3840)
# Output shape: (M: 3840, N: 3840)
# Required number of operations: 113246.208 millions
# Required bytes: 117.965 MB
# Arithmetic intensity: 960.000 FLOP/byte
# Theoretical peak single-core: 230.400 GFLOP/s
# Theoretical peak multi: 4147.200 GFLOP/s
# Make sure to not bench Apple Accelerate or the default Linux BLAS.

# OpenBLAS benchmark
# Collected 10 samples in 0.499 seconds
# Average time: 49.279 ms
# Stddev time: 3.924 ms
# Min time: 47.855 ms
# Max time: 60.436 ms
# Perf: 2298.061 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1965942382812

# Laser production implementation
# Collected 10 samples in 6.828 seconds
# Average time: 682.218 ms
# Stddev time: 9.549 ms
# Min time: 667.896 ms
# Max time: 693.479 ms
# Perf: 165.997 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1968383789062

# PyTorch Glow: libjit matmul implementation
# Collected 10 samples in 17.060 seconds
# Average time: 1705.967 ms
# Stddev time: 0.332 ms
# Min time: 1705.659 ms
# Max time: 1706.847 ms
# Perf: 66.382 GFLOP/s

# Display output[0] to make sure it's not optimized away
# 950.1965942382812

###############################
# i5-5227U 2.7 GHz Broadwell dual core AVX2

# $ OPENBLAS_NUM_THREADS=1 ./build/bench_gemm
# Warmup: 1.1973 s, result 224 (displayed to avoid compiler optimizing warmup away)
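The "Theoretical peak" and "Arithmetic intensity" figures in the logs above follow directly from the benchmark constants. A standalone sketch of that arithmetic, with the numbers copied from the logs:

let
  CpuGhz       = 3.6            # AVX512 all-core turbo
  CpuFlopCycle = 64             # 2 FMA/cycle x 16 float32 x (1 add + 1 mul)
  NumCpuCores  = 18
  M = 3840                      # 32*6*20
  N = 3840
  K = 3840

let requiredOps   = 2.0 * M.float * N.float * K.float     # 113246.208e6 FLOP
let requiredBytes = float((M*K + K*N) * sizeof(float32))  # reading A and B: 117.965 MB
let intensity     = requiredOps / requiredBytes           # 960 FLOP/byte
let peakSingle    = CpuGhz * CpuFlopCycle.float           # 230.4 GFLOP/s
let peakMulti     = peakSingle * NumCpuCores.float        # 4147.2 GFLOP/s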
4 changes: 4 additions & 0 deletions laser/cpuinfo.nim
@@ -282,6 +282,10 @@ proc cpuinfo_get_current_core*(): ptr CPUInfo_core {.cpuinfo_proc.}
# Otherwise "curSrcFolder" is ignored
{.passC: "-I" & cpuinfoPath & "src -I" & curSrcFolder & DirSep & "third_party".}

when defined(linux):
  {.passC: "-D_GNU_SOURCE".}
  {.passL: "-lpthread".}

template compile(path: static string): untyped =
  # Path: the path from cpuinfo/src folder
  const compiled_object = block:
6 changes: 4 additions & 2 deletions laser/primitives/matrix_multiplication/gemm.nim
@@ -198,11 +198,13 @@ proc gemm_strided*[T: SomeNumber](

  when defined(i386) or defined(amd64):
    when T is float32:
      if cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA)
      if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512)
      elif cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA)
      elif cpuinfo_has_x86_avx(): dispatch(x86_AVX)
      elif cpuinfo_has_x86_sse(): dispatch(x86_SSE)
    elif T is float64:
      if cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA)
      if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512)
      elif cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA)
      elif cpuinfo_has_x86_avx(): dispatch(x86_AVX)
      elif cpuinfo_has_x86_sse2(): dispatch(x86_SSE2)
    elif T is int32 or T is uint32:
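With this change gemm_strided selects the AVX512 micro-kernel at runtime whenever cpuinfo_has_x86_avx512f() reports support, before falling back to FMA, AVX and SSE. For reference, a minimal call sketch for a row-major float32 multiply; the parameter layout (sizes, then alpha, A and its element strides, B and its strides, beta, C and its strides) is assumed from the benchmark code rather than shown in this hunk:

import laser/primitives/matrix_multiplication/gemm

const
  M = 4
  N = 4
  K = 4

var a = newSeq[float32](M*K)   # row-major A
var b = newSeq[float32](K*N)   # row-major B
var c = newSeq[float32](M*N)   # row-major C (output)

# C = 1.0 * A * B + 0.0 * C; strides are (row, col) element strides.
gemm_strided(
  M, N, K,
  1'f32, a[0].unsafeAddr, K, 1,
         b[0].unsafeAddr, N, 1,
  0'f32, c[0].addr,       N, 1)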
19 changes: 10 additions & 9 deletions laser/primitives/matrix_multiplication/gemm_tiling.nim
@@ -77,22 +77,23 @@ type
    x86_SSE4_1,
    x86_AVX,
    x86_AVX_FMA,
    x86_AVX2
    # x86_AVX512 # TODO
    x86_AVX2,
    x86_AVX512
    # Note that Skylake SP, Xeon Bronze Silver and Gold 5XXX
    # only have a single AVX512 port and AVX2 can be faster
    # due to AVX512 downclocking

  X86_FeatureMap = array[CPUFeatureX86, int]

const X86_vecsize_float: X86_FeatureMap = [
  x86_Generic: 1,
  x86_SSE: 128 div 8,
  x86_SSE2: 128 div 8,
  x86_SSE4_1: 128 div 8,
  x86_AVX: 256 div 8,
  x86_AVX_FMA: 256 div 8,
  x86_AVX2: 256 div 8
  # x86_AVX512: 512 div 8
  x86_AVX2: 256 div 8,
  x86_AVX512: 512 div 8
]

const X86_vecsize_int: X86_FeatureMap = [
Expand All @@ -103,7 +104,7 @@ const X86_vecsize_int: X86_FeatureMap = [
x86_AVX: 128 div 8, # Not even addition with integer AVX
x86_AVX_FMA: 128 div 8,
x86_AVX2: 256 div 8,
# x86_AVX512: 512 div 8
x86_AVX512: 512 div 8
]

# mr * nr < number of registers - 4
@@ -115,8 +116,8 @@ const X86_regs: X86_FeatureMap = [
  x86_SSE4_1: 6,
  x86_AVX: 6, # 16 YMM registers
  x86_AVX_FMA: 6,
  x86_AVX2: 6
  # x86_AVX512: 6 # 32 ZMM registers
  x86_AVX2: 6,
  x86_AVX512: 14 # 32 ZMM registers
]
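The X86_regs values are sized against the budget in the comment above: the micro-kernel keeps mr rows of accumulators per SIMD vector of B, so mr times the number of B vectors, plus a few registers for A broadcasts and B loads, must fit in the register file. A quick check of that arithmetic, assuming two B vectors per iteration (NbVecs = 2, as in the existing AVX kernels; that constant is not shown in this hunk):

const NbVecs = 2
doAssert  6 * NbVecs <= 16 - 4   # AVX2:   16 YMM registers, mr = 6  -> 12 accumulators
doAssert 14 * NbVecs <= 32 - 4   # AVX512: 32 ZMM registers, mr = 14 -> 28 accumulators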

func x86_ukernel*(cpu: CPUFeatureX86, T: typedesc, c_unit_stride: bool): MicroKernel =
74 changes: 74 additions & 0 deletions laser/primitives/matrix_multiplication/gemm_ukernel_avx512.nim
@@ -0,0 +1,74 @@
# Laser
# Copyright (c) 2018 Mamy André-Ratsimbazafy
# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
# This file may not be copied, modified, or distributed except according to those terms.

import
  ./gemm_ukernel_generator, ./gemm_tiling,
  ../../simd

ukernel_generator(
  x86_AVX512,
  typ = float32,
  vectype = m512,
  nb_scalars = 16,
  simd_setZero = mm512_setzero_ps,
  simd_broadcast_value = mm512_set1_ps,
  simd_load_aligned = mm512_load_ps,
  simd_load_unaligned = mm512_loadu_ps,
  simd_store_unaligned = mm512_storeu_ps,
  simd_mul = mm512_mul_ps,
  simd_add = mm512_add_ps,
  simd_fma = mm512_fmadd_ps
)

ukernel_generator(
  x86_AVX512,
  typ = float64,
  vectype = m512d,
  nb_scalars = 8,
  simd_setZero = mm512_setzero_pd,
  simd_broadcast_value = mm512_set1_pd,
  simd_load_aligned = mm512_load_pd,
  simd_load_unaligned = mm512_loadu_pd,
  simd_store_unaligned = mm512_storeu_pd,
  simd_mul = mm512_mul_pd,
  simd_add = mm512_add_pd,
  simd_fma = mm512_fmadd_pd
)

template int32x16_muladd_unfused_avx512(a, b, c: m512i): m512i =
  mm512_add_epi32(mm512_mullo_epi32(a, b), c)

ukernel_generator(
  x86_AVX512,
  typ = int32,
  vectype = m512i,
  nb_scalars = 16,
  simd_setZero = mm512_setzero_si512,
  simd_broadcast_value = mm512_set1_epi32,
  simd_load_aligned = mm512_load_si512,
  simd_load_unaligned = mm512_loadu_si512,
  simd_store_unaligned = mm512_storeu_si512,
  simd_mul = mm512_mullo_epi32,
  simd_add = mm512_add_epi32,
  simd_fma = int32x16_muladd_unfused_avx512
)

template int64x8_muladd_unfused_avx512(a, b, c: m512i): m512i =
  mm512_add_epi64(mm512_mullo_epi64(a, b), c)

ukernel_generator(
  x86_AVX512,
  typ = int64,
  vectype = m512i,
  nb_scalars = 8,
  simd_setZero = mm512_setzero_si512,
  simd_broadcast_value = mm512_set1_epi64,
  simd_load_aligned = mm512_load_si512,
  simd_load_unaligned = mm512_loadu_si512,
  simd_store_unaligned = mm512_storeu_si512,
  simd_mul = mm512_mullo_epi64,
  simd_add = mm512_add_epi64,
  simd_fma = int64x8_muladd_unfused_avx512
)
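AVX512 has no fused multiply-add for 32- or 64-bit integer lanes, so the two templates above synthesize one from a lane-wise multiply and add. Per lane the generated simd_fma computes the equivalent of the scalar sketch below (the SIMD multiply wraps on overflow, as mm512_mullo_* does):

# Scalar reference for the unfused integer "FMA" templates above:
# each lane of the result is a*b + c.
func muladdRef[T: int32 | int64](a, b, c: T): T {.inline.} =
  a * b + c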
@@ -13,7 +13,8 @@ import
  ./gemm_ukernel_sse4_1,
  ./gemm_ukernel_avx,
  ./gemm_ukernel_avx_fma,
  ./gemm_ukernel_avx2
  ./gemm_ukernel_avx2,
  ./gemm_ukernel_avx512

{.experimental: "dynamicBindSym".}
