From edfaba79625b69e0956f92960c4506e7d5b4955b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sat, 6 Jan 2024 12:55:14 +0100 Subject: [PATCH 1/4] Revert "@static if" This reverts commit 314beff49dedafe94639b2d9c426ba0227f01a76. --- src/LoopVectorization.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl index 34583f09..bf5795f0 100644 --- a/src/LoopVectorization.jl +++ b/src/LoopVectorization.jl @@ -20,7 +20,7 @@ export indices, vreduce, vcount -@static if VERSION >= v"1.11-DEV" +if VERSION >= v"1.11-DEV" macro turbo(args...) quote @inbounds @fastmath begin From fc0390a74ea166819b0e81cf0a556f2cca14835e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sat, 6 Jan 2024 12:58:38 +0100 Subject: [PATCH 2/4] Revert "Deprecate LV for Julia >= 1.11-DEV (#519)" This reverts commit a4a160f376d53fe4c44e16eafbf838bb243e659f. --- README.md | 4 - ext/ForwardDiffExt.jl | 2 - ext/SpecialFunctionsExt.jl | 2 - src/LoopVectorization.jl | 520 +++++++++++++++++-------------------- test/testsetup.jl | 5 - 5 files changed, 242 insertions(+), 291 deletions(-) diff --git a/README.md b/README.md index 099e903e..569b8b9b 100644 --- a/README.md +++ b/README.md @@ -12,10 +12,6 @@ [![LoopVectorization Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/LoopVectorization)](https://pkgs.genieframework.com?packages=LoopVectorization) -# NOTE: Deprecated for Julia v1.11 and above! - -LoopVectorization only works for Julia 1.3 through 1.10. For 1.11 and newer, it simply uses `@inbounds @fastmath` instead, so it should still get roughly the same answer, but both runtime and compile time performance may change dramatically. - ## Installation ```julia diff --git a/ext/ForwardDiffExt.jl b/ext/ForwardDiffExt.jl index f9c20ec5..26227f69 100644 --- a/ext/ForwardDiffExt.jl +++ b/ext/ForwardDiffExt.jl @@ -1,5 +1,4 @@ module ForwardDiffExt -if VERSION < v"1.11-DEV" import ForwardDiff, ChainRulesCore using LoopVectorization, VectorizationBase, SLEEFPirates, ForwardDiff @@ -378,4 +377,3 @@ for f in (:vmapt, :vmapnt, :vmapntt) end end end -end diff --git a/ext/SpecialFunctionsExt.jl b/ext/SpecialFunctionsExt.jl index 2b1ed5fb..bfd813ee 100644 --- a/ext/SpecialFunctionsExt.jl +++ b/ext/SpecialFunctionsExt.jl @@ -1,8 +1,6 @@ module SpecialFunctionsExt -if VERSION < v"1.11-DEV" using SpecialFunctions using LoopVectorization: VectorizationBase using LoopVectorization: AbstractSIMD @inline SpecialFunctions.erf(x::AbstractSIMD) = VectorizationBase.verf(float(x)) end -end diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl index bf5795f0..7151df75 100644 --- a/src/LoopVectorization.jl +++ b/src/LoopVectorization.jl @@ -1,10 +1,189 @@ module LoopVectorization -export indices, +if isdefined(Base, :Experimental) && + isdefined(Base.Experimental, Symbol("@max_methods")) + @eval Base.Experimental.@max_methods 1 +end + +using ArrayInterface: UpTri, LoTri +using Static: StaticInt, gt, static, Zero, One, reduce_tup +using VectorizationBase, + SLEEFPirates, UnPack, OffsetArrays, StaticArrayInterface +const ArrayInterface = StaticArrayInterface +using LayoutPointers: + AbstractStridedPointer, + StridedPointer, + StridedBitPointer, + grouped_strided_pointer, + stridedpointer_preserve, + GroupedStridedPointers +import LayoutPointers + +using SIMDTypes: NativeTypes + +using VectorizationBase: + mask, + MM, + AbstractMask, + data, + AbstractSIMD, + vzero, + offsetprecalc, + lazymul, + vadd_nw, + vadd_nsw, + vadd_nuw, + vsub_nw, + vsub_nsw, + vsub_nuw, + vmul_nw, + vmul_nsw, + vmul_nuw, + vfmaddsub, + vfmsubadd, + vpermilps177, + vmovsldup, + vmovshdup, + maybestaticfirst, + maybestaticlast, + gep, + gesp, + vfmadd, + vfmsub, + vfnmadd, + vfnmsub, + vfmadd_fast, + vfmsub_fast, + vfnmadd_fast, + vfnmsub_fast, + vfmadd231, + vfmsub231, + vfnmadd231, + vfnmsub231, + vfma_fast, + vmuladd_fast, + vdiv_fast, + vadd_fast, + vsub_fast, + vmul_fast, + relu, + stridedpointer, + _vload, + _vstore!, + reduced_add, + reduced_prod, + reduce_to_add, + reduce_to_prod, + reduced_max, + reduced_min, + reduce_to_max, + reduce_to_min, + reduced_all, + reduced_any, + reduce_to_all, + reduce_to_any, + vsum, + vprod, + vmaximum, + vminimum, + vany, + vall, + Unroll, + VecUnroll, + preserve_buffer, + zero_vecunroll, + vbroadcast_vecunroll, + _vzero, + _vbroadcast, + contract_add, + collapse_add, + contract_mul, + collapse_mul, + contract_max, + collapse_max, + contract_min, + collapse_min, + contract_and, + collapse_and, + contract_or, + collapse_or, + max_mask, + maybestaticsize, + zero_mask + +using HostCPUFeatures: + pick_vector_width, + register_size, + register_count, + has_opmask_registers, + unwrap, + get_cpu_name +using CPUSummary: num_cores, cache_linesize, cache_size + +using IfElse: ifelse + +using ThreadingUtilities, PolyesterWeave +using Base.Broadcast: Broadcasted, DefaultArrayStyle +using LinearAlgebra: Adjoint, Transpose, Diagonal +using Base.Meta: isexpr +using DocStringExtensions +import LinearAlgebra # for check_args + +using Base: unsafe_trunc + +using Base.FastMath: + add_fast, + sub_fast, + mul_fast, + div_fast, + inv_fast, + abs2_fast, + rem_fast, + max_fast, + min_fast, + pow_fast, + sqrt_fast +using SLEEFPirates: + log_fast, + log2_fast, + log10_fast, + pow, + sin_fast, + cos_fast, + sincos_fast, + tan_fast + +using StaticArrayInterface: + OptionallyStaticUnitRange, + OptionallyStaticRange, + StaticBool, + True, + False, + indices, + static_strides, + offsets, + static_size, + static_axes, + StrideIndex +using CloseOpenIntervals: AbstractCloseOpen, CloseOpen#, SafeCloseOpen +# @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support +# using ArrayInterface: static_step +# else # Julia 1.5 did not define `step` on CartesianIndices +@inline static_step(x) = ArrayInterface.static_step(x) +@inline static_step(x::CartesianIndices) = + VectorizationBase.CartesianVIndex(map(static_step, x.indices)) +# end + +export LowDimArray, + stridedpointer, + indices, + static, @avx, @avxt, @turbo, @tturbo, + *ˡ, + _turbo_!, vmap, vmap!, vmapt, @@ -13,6 +192,8 @@ export indices, vmapnt!, vmapntt, vmapntt!, + tanh_fast, + sigmoid_fast, vfilter, vfilter!, vmapreduce, @@ -20,288 +201,71 @@ export indices, vreduce, vcount -if VERSION >= v"1.11-DEV" - macro turbo(args...) - quote - @inbounds @fastmath begin - $(esc(last(args))) - end - end - end - const var"@tturbo" = var"@turbo" - const var"@avx" = var"@turbo" - const var"@avxt" = var"@turbo" - const vmap = map - const vmap! = map! - const vmapt = map - const vmapt! = map! - const vmapnt = map - const vmapnt! = map! - const vmapntt = map - const vmapntt! = map! - const vfilter = filter - const vfilter! = filter! - const vmapreduce = mapreduce - const vsum = sum - const vreduce = reduce - const vcount = count - - indices(A::AbstractArray, i::Integer) = axes(A, i) - function _check_axes_match(ax::Tuple) - fax = first(ax) - foreach(Base.tail(ax)) do x - fax == x || throw(DimensionMismatch("Axes do not match.")) - end - fax - end - indices(a::Tuple, b::Tuple) = _check_axes_match(map(axes, a, b)) - indices(a::Tuple, b::Integer) = _check_axes_match(map(Base.Fix2(axes, b), a)) - function indices(a::AbstractArray, b::Tuple) - _check_axes_match(map(Base.Fix1(axes, a), b)) - end - -else - if isdefined(Base, :Experimental) && - isdefined(Base.Experimental, Symbol("@max_methods")) - @eval Base.Experimental.@max_methods 1 - end - export LowDimArray, - static, stridedpointer, *ˡ, _turbo_!, tanh_fast, sigmoid_fast - - using ArrayInterface: UpTri, LoTri - using Static: StaticInt, gt, static, Zero, One, reduce_tup - using VectorizationBase, - SLEEFPirates, UnPack, OffsetArrays, StaticArrayInterface - const ArrayInterface = StaticArrayInterface - using LayoutPointers: - AbstractStridedPointer, - StridedPointer, - StridedBitPointer, - grouped_strided_pointer, - stridedpointer_preserve, - GroupedStridedPointers - import LayoutPointers - - using SIMDTypes: NativeTypes - - using VectorizationBase: - mask, - MM, - AbstractMask, - data, - AbstractSIMD, - vzero, - offsetprecalc, - lazymul, - vadd_nw, - vadd_nsw, - vadd_nuw, - vsub_nw, - vsub_nsw, - vsub_nuw, - vmul_nw, - vmul_nsw, - vmul_nuw, - vfmaddsub, - vfmsubadd, - vpermilps177, - vmovsldup, - vmovshdup, - maybestaticfirst, - maybestaticlast, - gep, - gesp, - vfmadd, - vfmsub, - vfnmadd, - vfnmsub, - vfmadd_fast, - vfmsub_fast, - vfnmadd_fast, - vfnmsub_fast, - vfmadd231, - vfmsub231, - vfnmadd231, - vfnmsub231, - vfma_fast, - vmuladd_fast, - vdiv_fast, - vadd_fast, - vsub_fast, - vmul_fast, - relu, - stridedpointer, - _vload, - _vstore!, - reduced_add, - reduced_prod, - reduce_to_add, - reduce_to_prod, - reduced_max, - reduced_min, - reduce_to_max, - reduce_to_min, - reduced_all, - reduced_any, - reduce_to_all, - reduce_to_any, - vsum, - vprod, - vmaximum, - vminimum, - vany, - vall, - Unroll, - VecUnroll, - preserve_buffer, - zero_vecunroll, - vbroadcast_vecunroll, - _vzero, - _vbroadcast, - contract_add, - collapse_add, - contract_mul, - collapse_mul, - contract_max, - collapse_max, - contract_min, - collapse_min, - contract_and, - collapse_and, - contract_or, - collapse_or, - max_mask, - maybestaticsize, - zero_mask - - using HostCPUFeatures: - pick_vector_width, - register_size, - register_count, - has_opmask_registers, - unwrap, - get_cpu_name - using CPUSummary: num_cores, cache_linesize, cache_size - - using IfElse: ifelse - - using ThreadingUtilities, PolyesterWeave - using Base.Broadcast: Broadcasted, DefaultArrayStyle - using LinearAlgebra: Adjoint, Transpose, Diagonal - using Base.Meta: isexpr - using DocStringExtensions - import LinearAlgebra # for check_args - - using Base: unsafe_trunc - - using Base.FastMath: - add_fast, - sub_fast, - mul_fast, - div_fast, - inv_fast, - abs2_fast, - rem_fast, - max_fast, - min_fast, - pow_fast, - sqrt_fast - using SLEEFPirates: - log_fast, - log2_fast, - log10_fast, - pow, - sin_fast, - cos_fast, - sincos_fast, - tan_fast - - using StaticArrayInterface: - OptionallyStaticUnitRange, - OptionallyStaticRange, - StaticBool, - True, - False, - indices, - static_strides, - offsets, - static_size, - static_axes, - StrideIndex - using CloseOpenIntervals: AbstractCloseOpen, CloseOpen#, SafeCloseOpen - # @static if VERSION ≥ v"1.6.0-rc1" #TODO: delete `else` when dropping 1.5 support - # using ArrayInterface: static_step - # else # Julia 1.5 did not define `step` on CartesianIndices - @inline static_step(x) = ArrayInterface.static_step(x) - @inline static_step(x::CartesianIndices) = - VectorizationBase.CartesianVIndex(map(static_step, x.indices)) - # end +const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL = + Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##") - const VECTORWIDTHSYMBOL, ELTYPESYMBOL, MASKSYMBOL = - Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##"), Symbol("##mask##") +include("vectorizationbase_compat/contract_pass.jl") +include("vectorizationbase_compat/subsetview.jl") +include("getconstindexes.jl") +include("predicates.jl") +include("simdfunctionals/map.jl") +include("simdfunctionals/filter.jl") +include("modeling/costs.jl") +include("modeling/operations.jl") +include("modeling/graphs.jl") +include("codegen/operation_evaluation_order.jl") +include("parse/memory_ops_common.jl") +include("parse/add_loads.jl") +include("parse/add_stores.jl") +include("parse/add_compute.jl") +include("parse/add_constants.jl") +include("parse/add_ifelse.jl") +include("modeling/determinestrategy.jl") +include("codegen/line_number_nodes.jl") +include("codegen/loopstartstopmanager.jl") +include("codegen/lower_compute.jl") +include("codegen/lower_constant.jl") +include("codegen/lower_memory_common.jl") +include("codegen/lower_load.jl") +include("codegen/lower_store.jl") +include("codegen/lowering.jl") +include("codegen/split_loops.jl") +include("codegen/lower_threads.jl") +include("condense_loopset.jl") +include("transforms.jl") +include("reconstruct_loopset.jl") +include("constructors.jl") +include("user_api_conveniences.jl") +include("simdfunctionals/mapreduce.jl") +include("simdfunctionals/count.jl") +include("broadcast.jl") - include("vectorizationbase_compat/contract_pass.jl") - include("vectorizationbase_compat/subsetview.jl") - include("getconstindexes.jl") - include("predicates.jl") - include("simdfunctionals/map.jl") - include("simdfunctionals/filter.jl") - include("modeling/costs.jl") - include("modeling/operations.jl") - include("modeling/graphs.jl") - include("codegen/operation_evaluation_order.jl") - include("parse/memory_ops_common.jl") - include("parse/add_loads.jl") - include("parse/add_stores.jl") - include("parse/add_compute.jl") - include("parse/add_constants.jl") - include("parse/add_ifelse.jl") - include("modeling/determinestrategy.jl") - include("codegen/line_number_nodes.jl") - include("codegen/loopstartstopmanager.jl") - include("codegen/lower_compute.jl") - include("codegen/lower_constant.jl") - include("codegen/lower_memory_common.jl") - include("codegen/lower_load.jl") - include("codegen/lower_store.jl") - include("codegen/lowering.jl") - include("codegen/split_loops.jl") - include("codegen/lower_threads.jl") - include("condense_loopset.jl") - include("transforms.jl") - include("reconstruct_loopset.jl") - include("constructors.jl") - include("user_api_conveniences.jl") - include("simdfunctionals/mapreduce.jl") - include("simdfunctionals/count.jl") - include("broadcast.jl") +""" +LoopVectorization provides macros and functions that combine SIMD vectorization and +loop-reordering so as to improve performance: - """ - LoopVectorization provides macros and functions that combine SIMD vectorization and - loop-reordering so as to improve performance: + - [`@turbo`](@ref): transform `for`-loops and broadcasting + - [`vmapreduce`](@ref): vectorized version of `mapreduce` + - [`vreduce`](@ref): vectorized version of `reduce` + - [`vsum`](@ref): vectorized version of `sum` + - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!` + - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!` + - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!` + - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!` +""" +LoopVectorization - - [`@turbo`](@ref): transform `for`-loops and broadcasting - - [`vmapreduce`](@ref): vectorized version of `mapreduce` - - [`vreduce`](@ref): vectorized version of `reduce` - - [`vsum`](@ref): vectorized version of `sum` - - [`vmap`](@ref) and `vmap!`: vectorized version of `map` and `map!` - - [`vmapnt`](@ref) and `vmapnt!`: non-temporal variants of `vmap` and `vmap!` - - [`vmapntt`](@ref) and `vmapntt!`: threaded variants of `vmapnt` and `vmapnt!` - - [`vfilter`](@ref) and `vfilter!`: vectorized versions of `filter` and `filter!` - """ - LoopVectorization +include("precompile.jl") +# _precompile_() - include("precompile.jl") - # _precompile_() +# _vreduce(+, Float64[1.0]) +# matmul_params(64, 32, 64) - # _vreduce(+, Float64[1.0]) - # matmul_params(64, 32, 64) +# import ChainRulesCore, ForwardDiff +# include("vmap_grad.jl") +if !isdefined(Base, :get_extension) + include("../ext/ForwardDiffExt.jl") + include("../ext/SpecialFunctionsExt.jl") +end - # import ChainRulesCore, ForwardDiff - # include("vmap_grad.jl") - if !isdefined(Base, :get_extension) - include("../ext/ForwardDiffExt.jl") - include("../ext/SpecialFunctionsExt.jl") - end -end # if VERSION end # module diff --git a/test/testsetup.jl b/test/testsetup.jl index df27ccaa..b60c2022 100644 --- a/test/testsetup.jl +++ b/test/testsetup.jl @@ -2,12 +2,7 @@ using Test using Pkg using LoopVectorization -if VERSION >= v"1.11-DEV" -const var"@_avx" = LoopVectorization.var"@turbo" -else const var"@_avx" = LoopVectorization.var"@_turbo" -end - using LinearAlgebra function clenshaw(x, coeff) From 8ee0bdddc35a4551460c69a17375dac47b26351b Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Sat, 6 Jan 2024 13:04:41 +0100 Subject: [PATCH 3/4] Deprecation notice. --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 569b8b9b..099e903e 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,10 @@ [![LoopVectorization Downloads](https://shields.io/endpoint?url=https://pkgs.genieframework.com/api/v1/badge/LoopVectorization)](https://pkgs.genieframework.com?packages=LoopVectorization) +# NOTE: Deprecated for Julia v1.11 and above! + +LoopVectorization only works for Julia 1.3 through 1.10. For 1.11 and newer, it simply uses `@inbounds @fastmath` instead, so it should still get roughly the same answer, but both runtime and compile time performance may change dramatically. + ## Installation ```julia From 5cf8ddb56695babd8de05a5d626e4e25c9adbfa5 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 8 Jan 2024 20:29:46 +0100 Subject: [PATCH 4/4] Remove broken register passing optimization. --- Project.toml | 2 +- src/codegen/lower_threads.jl | 8 ++--- src/condense_loopset.jl | 69 ++---------------------------------- src/reconstruct_loopset.jl | 22 ++---------- 4 files changed, 9 insertions(+), 92 deletions(-) diff --git a/Project.toml b/Project.toml index 21fd26c3..2d028c05 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LoopVectorization" uuid = "bdcacae8-1622-11e9-2a5c-532679323890" authors = ["Chris Elrod "] -version = "0.12.168" +version = "0.12.169" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" diff --git a/src/codegen/lower_threads.jl b/src/codegen/lower_threads.jl index 6c268a30..40f8b541 100644 --- a/src/codegen/lower_threads.jl +++ b/src/codegen/lower_threads.jl @@ -495,7 +495,7 @@ function thread_one_loops_expr( $AM, $LPSYM, Val(typeof(var"#avx#call#args#")), - flatten_to_tuple(var"#avx#call#args#")... + var"#avx#call#args#"... )) update_return_values = if length(ls.outer_reductions) > 0 retv = loopset_return_value(ls, Val(false)) @@ -555,7 +555,7 @@ function thread_one_loops_expr( $AM, $LPSYM, StaticType{typeof(var"##lbvargs#to_launch##")}(), - flatten_to_tuple(var"##lbvargs#to_launch##"), + var"##lbvargs#to_launch##", var"#thread#id#" ) @@ -744,7 +744,7 @@ function thread_two_loops_expr( $AM, $LPSYM, Val(typeof(var"#avx#call#args#")), - flatten_to_tuple(var"#avx#call#args#")... + var"#avx#call#args#"... )) update_return_values = if length(ls.outer_reductions) > 0 retv = loopset_return_value(ls, Val(false)) @@ -867,7 +867,7 @@ function thread_two_loops_expr( $AM, $LPSYM, StaticType{typeof(var"##lbvargs#to_launch##")}(), - flatten_to_tuple(var"##lbvargs#to_launch##"), + var"##lbvargs#to_launch##", var"#thread#id#" ) var"#thread#mask#" >>>= var"#trailzing#zeros#" diff --git a/src/condense_loopset.jl b/src/condense_loopset.jl index fd0411e8..69c65933 100644 --- a/src/condense_loopset.jl +++ b/src/condense_loopset.jl @@ -4,71 +4,6 @@ Base.:|(u::Unsigned, it::IndexType) = u | UInt8(it) Base.:(==)(u::Unsigned, it::IndexType) = (u % UInt8) == UInt8(it) -function _append_fields!(t::Expr, body::Expr, sym::Symbol, ::Type{T}) where {T} - for f ∈ 1:fieldcount(T) - TF = fieldtype(T, f) - Base.issingletontype(TF) && continue - gfcall = Expr(:call, getfield, sym, f) - if fieldcount(TF) ≡ 0 - push!(t.args, gfcall) - elseif TF <: DataType - push!(t.args, Expr(:call, Expr(:curly, lv(:StaticType), gfcall))) - else - newsym = gensym(sym) - push!(body.args, Expr(:(=), newsym, gfcall)) - _append_fields!(t, body, newsym, TF) - end - end - return nothing -end -@generated function flatten_to_tuple(r::T) where {T} - body = Expr(:block, Expr(:meta, :inline)) - t = Expr(:tuple) - if Base.issingletontype(T) - nothing - elseif fieldcount(T) ≡ 0 - push!(t.args, :r) - elseif T <: DataType - push!(t.args, Expr(:call, Expr(:curly, lv(:StaticType), :r))) - else - _append_fields!(t, body, :r, T) - end - push!(body.args, t) - body -end -function rebuild_fields(offset::Int, ::Type{T}) where {T} - call = (T <: Tuple) ? Expr(:tuple) : Expr(:new, T) - for f ∈ 1:fieldcount(T) - TF = fieldtype(T, f) - if Base.issingletontype(TF) - push!(call.args, TF.instance) - elseif fieldcount(TF) ≡ 0 - push!(call.args, Expr(:call, getfield, :t, (offset += 1))) - elseif TF <: DataType - push!( - call.args, - Expr(:call, lv(:gettype), Expr(:call, getfield, :t, (offset += 1))) - ) - else - arg, offset = rebuild_fields(offset, TF) - push!(call.args, arg) - end - end - return call, offset -end -@generated function reassemble_tuple(::Type{T}, t::Tuple) where {T} - if Base.issingletontype(T) - return T.instance - elseif fieldcount(T) ≡ 0 - call = Expr(:call, getfield, :t, 1) - elseif T <: DataType - call = Expr(:call, lv(:gettype), Expr(:call, getfield, :t, 1)) - else - call, _ = rebuild_fields(0, T) - end - Expr(:block, Expr(:meta, :inline), call) -end - """ ArrayRefStruct @@ -893,9 +828,9 @@ function generate_call_types( ) ) if manyarg - push!(q.args, Expr(:call, lv(:flatten_to_tuple), vargsym)) + push!(q.args, vargsym) else - push!(q.args, Expr(:(...), Expr(:call, lv(:flatten_to_tuple), vargsym))) + push!(q.args, Expr(:(...), vargsym)) end Expr(:block, Expr(:(=), vargsym, Expr(:tuple, lbarg, extra_args))) end diff --git a/src/reconstruct_loopset.jl b/src/reconstruct_loopset.jl index d044fbeb..51c93ffb 100644 --- a/src/reconstruct_loopset.jl +++ b/src/reconstruct_loopset.jl @@ -1111,7 +1111,7 @@ Execute an `@turbo` block. The block's code is represented via the arguments: ::Val{var"#AM#"}, ::Val{var"#LPSYM#"}, ::Val{Tuple{var"#LB#",var"#V#"}}, - var"#flattened#var#arguments#"::Vararg{Any,var"#num#vargs#"} + var"#lv#tuple#args#"::Vararg{Any,var"#num#vargs#"} ) where { var"#UNROLL#", var"#OPS#", @@ -1132,15 +1132,6 @@ Execute an `@turbo` block. The block's code is represented via the arguments: var"#V#".parameters, var"#UNROLL#" ) - pushfirst!( - ls.preamble.args, - :( - var"#lv#tuple#args#" = reassemble_tuple( - Tuple{var"#LB#",var"#V#"}, - var"#flattened#var#arguments#" - ) - ) - ) post = hoist_constant_memory_accesses!(ls) # q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post) q = if (var"#UNROLL#"[10] > 1) && length(var"#LPSYM#") == length(ls.loops) @@ -1171,7 +1162,7 @@ end ::Val{var"#AM#"}, ::Val{var"#LPSYM#"}, ::Val{Tuple{var"#LB#",var"#V#"}}, - var"#flattened#var#arguments#"::Tuple{Vararg{Any,var"#num#vargs#"}} + var"#lv#tuple#args#"::Tuple{Vararg{Any,var"#num#vargs#"}} ) where { var"#UNROLL#", var"#OPS#", @@ -1192,15 +1183,6 @@ end var"#V#".parameters, var"#UNROLL#" ) - pushfirst!( - ls.preamble.args, - :( - var"#lv#tuple#args#" = reassemble_tuple( - Tuple{var"#LB#",var"#V#"}, - var"#flattened#var#arguments#" - ) - ) - ) post = hoist_constant_memory_accesses!(ls) # q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post) q = if (var"#UNROLL#"[10] > 1) && length(var"#LPSYM#") == length(ls.loops)