Improve M1 support (#183)

* Improve M1 support
* apple silicon params
* No `strict=true` for `makedocs`
JuliaLinearAlgebra · Sep 17, 2023 · ade704c · ade704c · chriselrod · Sep 17, 2023
1 parent dcc448d
commit ade704c
Show file tree

Hide file tree

Showing 10 changed files with 77 additions and 64 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Octavian"
 uuid = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"
 authors = ["Chris Elrod", "Dilum Aluthge", "Mason Protter", "contributors"]
-version = "0.3.25"
+version = "0.3.26"
 
 [deps]
 CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9"

diff --git a/benchmark/tilesearch.jl b/benchmark/tilesearch.jl
@@ -133,7 +133,7 @@ function matrix_range(S, ::Type{T} = Float64) where {T}
  Cs, As, Bs
 end
 
-T = Float64
+T = Float32
 min_size = round(
  Int,
  sqrt(

diff --git a/docs/make.jl b/docs/make.jl
@@ -17,7 +17,6 @@ makedocs(;
  "Public API" => "public-api.md",
  "Internals (Private)" => "internals.md"
  ],
- strict = true
 )
 
 deploydocs(; repo = "github.com/JuliaLinearAlgebra/Octavian.jl")
diff --git a/ext/ForwardDiffExt.jl b/ext/ForwardDiffExt.jl
@@ -2,10 +2,7 @@ module ForwardDiffExt
 
 using ForwardDiff: ForwardDiff
 
-using Octavian: ArrayInterface,
- @turbo, @tturbo,
- One, Zero,
- indices, static
+using Octavian: ArrayInterface, @turbo, @tturbo, One, Zero, indices, static
 import Octavian: real_rep, _matmul!, _matmul_serial!
 
 real_rep(a::AbstractArray{DualT}) where {TAG,T,DualT<:ForwardDiff.Dual{TAG,T}} =
@@ -53,9 +50,9 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  MKN = nothing
  ) where {TAG,T,DualT<:ForwardDiff.Dual{TAG,T}}
  if Bool(ArrayInterface.is_dense(_C)) &&
- Bool(ArrayInterface.is_column_major(_C)) &&
- Bool(ArrayInterface.is_dense(_A)) &&
- Bool(ArrayInterface.is_column_major(_A))
+  Bool(ArrayInterface.is_column_major(_C)) &&
+  Bool(ArrayInterface.is_dense(_A)) &&
+  Bool(ArrayInterface.is_column_major(_A))
  # we can avoid the reshape and call the standard method
  A = reinterpret(T, _A)
  C = reinterpret(T, _C)
@@ -94,9 +91,9 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  C = real_rep(_C)
  B = real_rep(_B)
  if Bool(ArrayInterface.is_dense(_C)) &&
- Bool(ArrayInterface.is_column_major(_C)) &&
- Bool(ArrayInterface.is_dense(_A)) &&
- Bool(ArrayInterface.is_column_major(_A))
+  Bool(ArrayInterface.is_column_major(_C)) &&
+  Bool(ArrayInterface.is_dense(_A)) &&
+  Bool(ArrayInterface.is_column_major(_A))
  # we can avoid the reshape and call the standard method
  Ar = reinterpret(T, _A)
  Cr = reinterpret(T, _C)
@@ -151,7 +148,7 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  _C
  end
 
-# multiplication of dual matrix by standard vector/matrix from the right
+ # multiplication of dual matrix by standard vector/matrix from the right
  @eval @inline function _matmul_serial!(
  _C::$(AbstractVectorOrMatrix){DualT},
  _A::AbstractMatrix{DualT},
@@ -161,9 +158,9 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  MKN
  ) where {TAG,T,DualT<:ForwardDiff.Dual{TAG,T}}
  if Bool(ArrayInterface.is_dense(_C)) &&
- Bool(ArrayInterface.is_column_major(_C)) &&
- Bool(ArrayInterface.is_dense(_A)) &&
- Bool(ArrayInterface.is_column_major(_A))
+  Bool(ArrayInterface.is_column_major(_C)) &&
+  Bool(ArrayInterface.is_dense(_A)) &&
+  Bool(ArrayInterface.is_column_major(_A))
  # we can avoid the reshape and call the standard method
  A = reinterpret(T, _A)
  C = reinterpret(T, _C)
@@ -200,9 +197,9 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  C = real_rep(_C)
  B = real_rep(_B)
  if Bool(ArrayInterface.is_dense(_C)) &&
- Bool(ArrayInterface.is_column_major(_C)) &&
- Bool(ArrayInterface.is_dense(_A)) &&
- Bool(ArrayInterface.is_column_major(_A))
+  Bool(ArrayInterface.is_column_major(_C)) &&
+  Bool(ArrayInterface.is_dense(_A)) &&
+  Bool(ArrayInterface.is_column_major(_A))
  # we can avoid the reshape and call the standard method
  Ar = reinterpret(T, _A)
  Cr = reinterpret(T, _C)

diff --git a/ext/HyperDualNumbersExt.jl b/ext/HyperDualNumbersExt.jl
@@ -1,10 +1,7 @@
 module HyperDualNumbersExt
 
 using HyperDualNumbers: Hyper
-using Octavian: ArrayInterface,
- @turbo, @tturbo,
- One, Zero,
- indices, static
+using Octavian: ArrayInterface, @turbo, @tturbo, One, Zero, indices, static
 import Octavian: real_rep, _matmul!, _matmul_serial!
 
 real_rep(a::AbstractArray{DualT}) where {T,DualT<:Hyper{T}} =
@@ -23,7 +20,7 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  nthread::Nothing = nothing,
  MKN = nothing,
  contig_axis = nothing
- ) where {T, DualT<:Hyper{T}}
+ ) where {T,DualT<:Hyper{T}}
  B = real_rep(_B)
  C = real_rep(_C)
 
@@ -52,9 +49,9 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  MKN = nothing
  ) where {T,DualT<:Hyper{T}}
  if Bool(ArrayInterface.is_dense(_C)) &&
- Bool(ArrayInterface.is_column_major(_C)) &&
- Bool(ArrayInterface.is_dense(_A)) &&
- Bool(ArrayInterface.is_column_major(_A))
+  Bool(ArrayInterface.is_column_major(_C)) &&
+  Bool(ArrayInterface.is_dense(_A)) &&
+  Bool(ArrayInterface.is_column_major(_A))
  # we can avoid the reshape and call the standard method
  A = reinterpret(T, _A)
  C = reinterpret(T, _C)
@@ -93,9 +90,9 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  C = real_rep(_C)
  B = real_rep(_B)
  if Bool(ArrayInterface.is_dense(_C)) &&
- Bool(ArrayInterface.is_column_major(_C)) &&
- Bool(ArrayInterface.is_dense(_A)) &&
- Bool(ArrayInterface.is_column_major(_A))
+  Bool(ArrayInterface.is_column_major(_C)) &&
+  Bool(ArrayInterface.is_dense(_A)) &&
+  Bool(ArrayInterface.is_column_major(_A))
  # we can avoid the reshape and call the standard method
  Ar = reinterpret(T, _A)
  Cr = reinterpret(T, _C)
@@ -139,7 +136,7 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  α,
  β,
  MKN
- ) where {T, DualT<:Hyper{T}}
+ ) where {T,DualT<:Hyper{T}}
  B = real_rep(_B)
  C = real_rep(_C)
 
@@ -157,7 +154,7 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  _C
  end
 
-# multiplication of dual matrix by standard vector/matrix from the right
+ # multiplication of dual matrix by standard vector/matrix from the right
  @eval @inline function _matmul_serial!(
  _C::$(AbstractVectorOrMatrix){DualT},
  _A::AbstractMatrix{DualT},
@@ -167,9 +164,9 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  MKN
  ) where {T,DualT<:Hyper{T}}
  if Bool(ArrayInterface.is_dense(_C)) &&
- Bool(ArrayInterface.is_column_major(_C)) &&
- Bool(ArrayInterface.is_dense(_A)) &&
- Bool(ArrayInterface.is_column_major(_A))
+  Bool(ArrayInterface.is_column_major(_C)) &&
+  Bool(ArrayInterface.is_dense(_A)) &&
+  Bool(ArrayInterface.is_column_major(_A))
  # we can avoid the reshape and call the standard method
  A = reinterpret(T, _A)
  C = reinterpret(T, _C)
@@ -201,14 +198,14 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  α,
  β,
  MKN
- ) where {T, DualT<:Hyper{T}}
+ ) where {T,DualT<:Hyper{T}}
  A = real_rep(_A)
  C = real_rep(_C)
  B = real_rep(_B)
  if Bool(ArrayInterface.is_dense(_C)) &&
- Bool(ArrayInterface.is_column_major(_C)) &&
- Bool(ArrayInterface.is_dense(_A)) &&
- Bool(ArrayInterface.is_column_major(_A))
+  Bool(ArrayInterface.is_column_major(_C)) &&
+  Bool(ArrayInterface.is_dense(_A)) &&
+  Bool(ArrayInterface.is_column_major(_A))
  # we can avoid the reshape and call the standard method
  Ar = reinterpret(T, _A)
  Cr = reinterpret(T, _C)
@@ -246,4 +243,4 @@ for AbstractVectorOrMatrix in (:AbstractVector, :AbstractMatrix)
  end
 end # for
 
-end # module
+end # module
diff --git a/src/Octavian.jl b/src/Octavian.jl
@@ -75,5 +75,4 @@ if !isdefined(Base, :get_extension)
  include("../ext/HyperDualNumbersExt.jl")
 end
 
-
 end # module Octavian
diff --git a/src/global_constants.jl b/src/global_constants.jl
@@ -18,6 +18,13 @@ MᵣW_mul_factor(::True) = StaticInt{4}()
 MᵣW_mul_factor(::False) = StaticInt{9}()
 MᵣW_mul_factor() = MᵣW_mul_factor(has_feature(Val(:x86_64_avx512f)))
 
+
+if Sys.ARCH === :aarch64 && (Sys.isapple() || occursin("apple", Sys.CPU_NAME::String))
+ W₁Default() = StaticFloat64{0.23015506935919203}()
+W₂Default() = StaticFloat64{0.16967706087713014}()
+R₁Default() = StaticFloat64{0.9982516031563079}()
+R₂Default() = StaticFloat64{0.5167030291302886}()
+else
 W₁Default(::True) = StaticFloat64{0.0007423708195588264}()
 W₂Default(::True) = StaticFloat64{0.7757548987718677}()
 R₁Default(::True) = StaticFloat64{0.7936663315339363}()
@@ -50,13 +57,14 @@ W₁Default() = W₁Default(has_feature(Val(:x86_64_avx512f)))
 W₂Default() = W₂Default(has_feature(Val(:x86_64_avx512f)))
 R₁Default() = R₁Default(has_feature(Val(:x86_64_avx512f)))
 R₂Default() = R₂Default(has_feature(Val(:x86_64_avx512f)))
-
-@static if Sys.ARCH === :x86_64 || Sys.ARCH === :i686
- first_cache() = StaticInt{2}()
-else
- first_cache() = StaticInt{1}()
 end
 
+# @static if Sys.ARCH === :x86_64 || Sys.ARCH === :i686
+first_cache() = StaticInt{2}()
+# else
+# first_cache() = StaticInt{1}()
+# end
+
 second_cache() = first_cache() + One()
 
 _first_cache_size(fcs::StaticInt) = ifelse(
@@ -69,7 +77,11 @@ first_cache_size() = _first_cache_size(cache_size(first_cache()))
 
 _second_cache_size(scs::StaticInt, ::True) = scs - cache_size(first_cache())
 _second_cache_size(scs::StaticInt, ::False) = scs
-_second_cache_size(::StaticInt{0}, ::Nothing) = StaticInt(3145728)
+@static if (Sys.isapple() || occursin("apple", Sys.CPU_NAME::String)) && Sys.ARCH === :aarch64
+ _second_cache_size(::StaticInt{0}, ::False) = StaticInt(100663296)
+else
+ _second_cache_size(::StaticInt{0}, ::False) = StaticInt(3145728)
+end
 function second_cache_size()
  sc = second_cache()
  _second_cache_size(cache_size(sc), cache_inclusive(sc))

diff --git a/src/init.jl b/src/init.jl
@@ -14,15 +14,13 @@ function __init__()
 end
 
 function init_bcache()
+ BCACHEPTR[] == C_NULL || return
+ c = Threads.nthreads() * second_cache_size()
  if bcache_count() ≢ Zero()
- if BCACHEPTR[] == C_NULL
- BCACHEPTR[] = VectorizationBase.valloc(
- Threads.nthreads() * second_cache_size() * bcache_count(),
- Cvoid,
- ccall(:jl_getpagesize, Int, ())
- )
- end
+ c *= bcache_count()
  end
+ BCACHEPTR[] =
+ VectorizationBase.valloc(c, Cvoid, ccall(:jl_getpagesize, Int, ()))
  nothing
 end
 

diff --git a/test/aqua.jl b/test/aqua.jl
@@ -1,5 +1,9 @@
 @testset "Aqua.jl" begin
- Aqua.test_all(Octavian; ambiguities = false, project_toml_formatting = false,
- stale_deps = (; ignore = [:ForwardDiff]))
+ Aqua.test_all(
+ Octavian;
+ ambiguities = false,
+ project_toml_formatting = false,
+ stale_deps = (; ignore = [:ForwardDiff])
+ )
  @test isempty(Test.detect_ambiguities(Octavian))
 end
diff --git a/test/hyperduals.jl b/test/hyperduals.jl
@@ -26,7 +26,7 @@ end
  @testset "real array from the right" begin
  A1dual = randdual(A1)
  C1dual = randdual(C1)
- 
+
  A2dual = deepcopy(A1dual)
  B2 = deepcopy(B1)
  C2dual = deepcopy(C1dual)
@@ -49,7 +49,6 @@ end
  @test reinterpret(Float64, C1dual) ≈ reinterpret(Float64, C2dual)
  end
 
-
  @testset "transposed arrays" begin
  A1dual = randdual(A1')
  C1dual = randdual(C1)
@@ -67,13 +66,21 @@ end
 
  Cref = zeros(Float64, size(C1)...)
  LinearAlgebra.mul!(Cref, A1, B1)
- @test (reinterpretHD(Float64, C1dual) ≈ reinterpretHD(Float64, C2dual) ≈ 
- reinterpretHD(Float64, C3dual) ≈ reinterpretHD(Float64, C4dual) ≈ Cref) && 
- (reinterpret(Float64, C1dual) ≈ reinterpret(Float64, C2dual) ≈ 
- reinterpret(Float64, C3dual) ≈ reinterpret(Float64, C4dual) )
+ @test (
+ reinterpretHD(Float64, C1dual) ≈
+ reinterpretHD(Float64, C2dual) ≈
+ reinterpretHD(Float64, C3dual) ≈
+ reinterpretHD(Float64, C4dual) ≈
+ Cref
+ ) && (
+ reinterpret(Float64, C1dual) ≈
+ reinterpret(Float64, C2dual) ≈
+ reinterpret(Float64, C3dual) ≈
+ reinterpret(Float64, C4dual)
+ )
  end
 
- @testset "two dual arrays" begin 
+ @testset "two dual arrays" begin
  A1d = randdual(A1)
  B1d = randdual(B1)
  @test reinterpret(Float64, Octavian.matmul(A1d, B1d, 1.3)) ≈