Commit

Merge PR #21 from JeffreySarnoff:more_ops

ffevotte committed Dec 23, 2020
2 parents 9c28058 + 4020535 commit 58b757b
Showing 5 changed files with 85 additions and 41 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,14 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm



## [Unreleased]

### Added

- Support for additional operations: `neg`, `abs`, `rem`, `muladd`



## [0.1.3] - 2020-12-23

### Changed
50 changes: 20 additions & 30 deletions README.md
@@ -73,7 +73,7 @@ Flop Counter: 1000 flop
└─────┴─────────┴─────────┘

julia> fieldnames(GFlops.Counter)
(:fma32, :fma64, :add32, :add64, :sub32, :sub64, :mul32, :mul64, :div32, :div64, :sqrt32, :sqrt64)
(:fma32, :fma64, :muladd32, :muladd64, :add32, :add64, :sub32, ...)

julia> cnt.add64
1000
@@ -85,38 +85,28 @@ julia> @gflops mixed_dot($x, $y);

## Caveats

### FMA - Fused Multiplication and Addition
### Fused Multiplication and Addition: FMA & MulAdd

On systems which support them, FMAs compute two operations (an addition and a
multiplication) in one instruction. `@count_ops` counts each individual FMA as
one operation, which makes it easier to interpret counters. However, `@gflops`
(and the internal `GFlops.flops` function) will count two floating-point
operations for each FMA, in accordance to the way high-performance benchmarks
usually behave:
On systems that support them, FMA and MulAdd instructions compute two
operations (a multiplication and an addition) in a single instruction.
`@count_ops` counts each individual FMA/MulAdd as one operation, which makes
counters easier to interpret. However, `@gflops` will count two floating-point
operations for each FMA or MulAdd, in accordance with the way high-performance
benchmarks usually behave:

```julia
julia> function fma_dot(x, y)
acc = zero(eltype(x))
@inbounds for i in eachindex(x, y)
acc = fma(x[i], y[i], acc)
end
acc
end
fma_dot (generic function with 1 method)

julia> x = rand(100); y = rand(100);

# 100 FMAs but 200 flop
julia> cnt = @count_ops fma_dot($x, $y)
Flop Counter: 200 flop
┌─────┬─────────┐
│ │ Float64 │
├─────┼─────────┤
│ fma │     100 │
└─────┴─────────┘

julia> @gflops fma_dot($x, $y);
1.58 GFlops, 2.12% peak (2.00e+02 flop, 1.27e-07 s, 0 alloc: 0 bytes)
julia> x = 0.5; coeffs = rand(10);

# 9 MulAdds but 18 flop
julia> cnt = @count_ops evalpoly($x, $coeffs)
Flop Counter: 18 flop
┌────────┬─────────┐
│ │ Float64 │
├────────┼─────────┤
│ muladd │       9 │
└────────┴─────────┘

julia> @gflops evalpoly($x, $coeffs);
0.87 GFlops, 1.63% peak (1.80e+01 flop, 2.06e-08 s, 0 alloc: 0 bytes)
```
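The other operations added in this commit (`neg`, `abs`, `rem`) are each
charged a single flop. A minimal sketch, mirroring the calls exercised in this
commit's test suite (counter field names taken from there; REPL output
omitted, as formatting may vary):

```julia
using GFlops

# Each intrinsic increments its own counter field, suffixed by the
# precision of the operands (32 or 64):
cnt = @count_ops abs(-4.2)
cnt.abs64           # == 1 per the test suite
GFlops.flop(cnt)    # abs is charged a single flop

cnt = @count_ops rem(12.0f0, 5.0f0)
cnt.rem32           # == 1; the Float32 variant has its own counter
```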

### Non-julia code
16 changes: 5 additions & 11 deletions examples.jl
@@ -22,19 +22,13 @@ cnt.add64



function fma_dot(x, y)
acc = zero(eltype(x))
@inbounds for i in eachindex(x, y)
acc = fma(x[i], y[i], acc)
end
acc
end
x = rand(100); y = rand(100);
println("# 100 FMAs but 200 flop")
cnt = @count_ops fma_dot($x, $y)
@gflops fma_dot($x, $y);
x = 0.5; coeffs = rand(10);
println("# 9 MulAdds but 18 flop")
cnt = @count_ops evalpoly($x, $coeffs)
@gflops evalpoly($x, $coeffs);



using LinearAlgebra
x = rand(1000); y = rand(1000);
@count_ops dot($x, $y)
4 changes: 4 additions & 0 deletions src/overdub.jl
@@ -4,16 +4,20 @@ Cassette.@context CounterCtx;

const ternops = (
(:fma, Core.Intrinsics.fma_float, 2), # 2 flops per FMA instruction
(:muladd, Core.Intrinsics.muladd_float, 2), # 2 flops per muladd instruction
)

const binops = (
(:add, Core.Intrinsics.add_float, 1),
(:sub, Core.Intrinsics.sub_float, 1),
(:mul, Core.Intrinsics.mul_float, 1),
(:div, Core.Intrinsics.div_float, 1),
(:rem, Core.Intrinsics.rem_float, 1),
)

const unops = (
(:abs, Core.Intrinsics.abs_float, 1),
(:neg, Core.Intrinsics.neg_float, 1),
(:sqrt, Core.Intrinsics.sqrt_llvm, 1),
)
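Each tuple above pairs a counter-name prefix with the intrinsic it instruments
and the number of flops charged per call: the fused `fma`/`muladd` ops cost 2,
everything else 1. As an illustration only (not the package's actual
implementation, just a sketch of how such a weight table turns raw counters
into a flop total):

```julia
# Hypothetical sketch: sum counters weighted by the per-op cost
# declared in the ternops/binops/unops tables above.
const flop_weight = Dict(
    :fma => 2, :muladd => 2,                      # fused ops: 2 flops each
    :add => 1, :sub => 1, :mul => 1, :div => 1,
    :rem => 1, :abs => 1, :neg => 1, :sqrt => 1,
)

total_flops(counts) = sum(flop_weight[op] * n for (op, n) in counts; init = 0)

total_flops(Dict(:muladd => 9))  # 9 muladds are charged 18 flops
```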

48 changes: 48 additions & 0 deletions test/runtests.jl
@@ -105,6 +105,30 @@ end
end
end

@testset "neg" begin
    let cnt = @count_ops -(4.2)
        @test cnt.neg64 == 1
        @test GFlops.flop(cnt) == 1
    end

    let cnt = @count_ops -(4.2f0)
        @test cnt.neg32 == 1
        @test GFlops.flop(cnt) == 1
    end
end

@testset "abs" begin
    let cnt = @count_ops abs(-4.2)
        @test cnt.abs64 == 1
        @test GFlops.flop(cnt) == 1
    end

    let cnt = @count_ops abs(-4.2f0)
        @test cnt.abs32 == 1
        @test GFlops.flop(cnt) == 1
    end
end

@testset "sqrt" begin
let cnt = @count_ops sqrt(4.2)
@test cnt.sqrt64 == 1
@@ -117,6 +141,18 @@ end
end
end

@testset "rem" begin
    let cnt = @count_ops rem(12.0, 5.0)
        @test cnt.rem64 == 1
        @test GFlops.flop(cnt) == 1
    end

    let cnt = @count_ops rem(12.0f0, 5.0f0)
        @test cnt.rem32 == 1
        @test GFlops.flop(cnt) == 1
    end
end

@testset "fma" begin
let cnt = @count_ops fma(1.0, 2.0, 3.0)
@test cnt.fma64 == 1
@@ -129,6 +165,18 @@ end
end
end

@testset "muladd" begin
    let cnt = @count_ops muladd(1.0, 2.0, 3.0)
        @test cnt.muladd64 == 1
        @test GFlops.flop(cnt) == 2
    end

    let cnt = @count_ops muladd(1.0f0, 2.0f0, 3.0f0)
        @test cnt.muladd32 == 1
        @test GFlops.flop(cnt) == 2
    end
end

@testset "interpolated arguments" begin
let N = 100
