diff --git a/src/f4/linalg/backend_randomized_threaded.jl b/src/f4/linalg/backend_randomized_threaded.jl index 6d83683a..b83fcfa0 100644 --- a/src/f4/linalg/backend_randomized_threaded.jl +++ b/src/f4/linalg/backend_randomized_threaded.jl @@ -72,16 +72,18 @@ function linalg_randomized_reduce_matrix_lower_part_threaded_cas!( resize!(matrix.some_coeffs, nlow) resize!(matrix.sentinels, ncols) - # If there is a pivot with the leading column i, then sentinels[i] = 1. + # If there is a pivot with the leading column i, then we set + # sentinels[i] = 1. + # If sentinels[i] = 1, and if the pivot is synced, then we set + # sentinels[i] = 2. # Otherwise, sentinels[i] = 0. - # Once sentinels[i] becomes 1, this equality is unchanged. - # NOTE: for all valid indices, isassigned(sentinels, i) must hold. + # Once sentinels[i] becomes 1 or 2, this is unchanged. sentinels = matrix.sentinels @inbounds for i in 1:ncols sentinels[i] = 0 end @inbounds for i in 1:nup - sentinels[matrix.upper_rows[i][1]] = 1 + sentinels[matrix.upper_rows[i][1]] = 2 end # Allocate thread-local buffers @@ -135,7 +137,7 @@ function linalg_randomized_reduce_matrix_lower_part_threaded_cas!( while !success # Reduce the combination by pivots @invariant 1 <= first_nnz_col <= length(t_local_row) - zeroed = linalg_reduce_dense_row_by_pivots_sparse!( + zeroed = linalg_reduce_dense_row_by_pivots_sparse_threadsafe0!( new_sparse_row_support, new_sparse_row_coeffs, t_local_row, @@ -145,6 +147,7 @@ function linalg_randomized_reduce_matrix_lower_part_threaded_cas!( first_nnz_col, ncols, arithmetic, + sentinels, tmp_pos=-1 ) @@ -154,14 +157,6 @@ function linalg_randomized_reduce_matrix_lower_part_threaded_cas!( break end - # At this point, we have discovered a row that is not reduced to - # zero (yet). Two outcomes are possible: - # - either this row is a new pivot row. In this case, - # sentinels[...] is 0. We set sentinels[...] to 1, update the - # pivots and the matrix, and break out of the loop. - # - or, this row can be further reduced. In this case, - # sentinels[...] was already 1. - # Sync point. Everything before this point becomes visible to # other threads once they reach this point. # NOTE: Note the absence of a total order on atomic operations @@ -187,6 +182,20 @@ function linalg_randomized_reduce_matrix_lower_part_threaded_cas!( matrix.some_coeffs[absolute_row_index] = new_sparse_row_coeffs matrix.lower_to_coeffs[new_sparse_row_support[1]] = absolute_row_index pivots[new_sparse_row_support[1]] = new_sparse_row_support + + # This atomic write is paired with an atomic load in + # `linalg_reduce_dense_row_by_pivots_sparse_threadsafe0!`. The + # purpose of these atomics is to fence the above assignment + # pivots[new_sparse_row_support[1]] = new_sparse_row_support + # + # This way, the next atomic load from the same location in + # `sentinels` will also receive all relevant modifications to + # `pivots` as a side-effect. + Atomix.set!( + Atomix.IndexableRef(sentinels, (Int(new_sparse_row_support[1]),)), + Int8(2), + Atomix.release + ) else # go for another iteration first_nnz_col = new_sparse_row_support[1] diff --git a/src/f4/linalg/backend_threaded.jl b/src/f4/linalg/backend_threaded.jl index 45f73e0f..668081aa 100644 --- a/src/f4/linalg/backend_threaded.jl +++ b/src/f4/linalg/backend_threaded.jl @@ -56,16 +56,16 @@ function linalg_reduce_matrix_lower_part_threaded_cas!( # If there is a pivot with the leading column i, then we set # sentinels[i] = 1.
- # If sentinels[i] = 1, and the pivot is synced, then we set + # If sentinels[i] = 1, and if the pivot is synced, then we set # sentinels[i] = 2. # Otherwise, sentinels[i] = 0. - # Once sentinels[i] becomes 1 or 2, this equality is unchanged. + # Once sentinels[i] becomes 1 or 2, this is unchanged. sentinels = matrix.sentinels @inbounds for i in 1:ncols sentinels[i] = 0 end @inbounds for i in 1:nup - sentinels[matrix.upper_rows[i][1]] = 1 + sentinels[matrix.upper_rows[i][1]] = 2 end # Allocate thread-local buffers @@ -114,15 +114,6 @@ function linalg_reduce_matrix_lower_part_threaded_cas!( # If the row is fully reduced zeroed && break - # At this point, we have discovered a row that is not reduced to - # zero (yet). Thus, two outcomes are possible: - # 1. sentinels[...] is 0, meaning that this row is potentially a new - # pivot row. We try to set sentinels[...] to 1, update the pivots - # and the matrix, and break out of the loop. - # If assigning sentinels[...] to 1 fails, go to 2. - # 2. sentinels[...] is already 1, meaning that this row can be - # further reduced on the following iteration - # Sync point. Everything before this point becomes visible to # all other threads once they reach this point. # NOTE: Note the absence of a total ordering on atomic operations @@ -154,15 +145,9 @@ function linalg_reduce_matrix_lower_part_threaded_cas!( # This way, the next atomic load from the same location in # `sentinels` will also receive all relevant modifications to # `pivots` as a side-effect. - # - # Roughly speaking, in single-thread, this construction is - # similar to an explicit barrier in C - # asm volatile("" ::: "memory"); - # which can be used to tell the compiler that the writes and - # loads cannot be freely moved below or above the barrier. Atomix.set!( Atomix.IndexableRef(sentinels, (Int(new_sparse_row_support[1]),)), - Int8(1), + Int8(2), Atomix.release ) else @@ -206,34 +191,7 @@ function linalg_reduce_dense_row_by_pivots_sparse_threadsafe0!( continue end - # Say, there are two threads, A and B. Thread A non-atomically writes to - # pivots[i]. Then loading pivots[i] in thread B may produce a value out - # of thin air. - # - # Modifying pivots[i] atomically is impossible in Julia. - # Thus, we may need additional syncronization in this case. - # (Note that this is usually not an issue for x86-64, since it performs - # well-aligned writes and loads atomically by default, but we should not - # rely on that) - # - # From https://en.cppreference.com/w/cpp/atomic/memory_order, if thread - # B performs (at least) acquire atomic memory load, then "all memory - # writes that happened-before the atomic store from the point of view of - # thread A, become visible side-effects in thread B". - # - # In other words, consider the following "sequence" of writes/loads: - # thread A: - # 1. write pivots[i] = X - # 2. write sentinels[i] = 1 (atomic) - # thread B: - # 3. load sentinels[i] (atomic) - # 4. load pivots[i] - # If 2. happened before 3., then 1. happened before 4. - # - # For this reason, we perform the atomic write 2. in - # `linalg_reduce_matrix_lower_part_threaded_cas_maybe_correct!`, and - # perform the atomic load 3. here.
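The comments deleted above spelled out the memory-ordering argument in detail; the essence is: a thread first fills in the new pivot row with plain writes, then publishes it with a release store of 2 into `sentinels`, and any thread that observes 2 via an acquire load is guaranteed to also observe the finished writes to `pivots`. A minimal standalone sketch of this pairing, reusing the same Atomix calls as the patch (the function and variable names here are illustrative, not Groebner internals):

using Atomix
using Base.Threads: @spawn

function publish_and_read()
    pivots = Vector{Vector{Int}}(undef, 4)  # plain, non-atomic payload
    sentinels = zeros(Int8, 4)              # atomic flags; 0 = slot empty

    writer = @spawn begin
        pivots[1] = [1, 2, 3]  # 1. plain write of the payload
        # 2. release store: the write in 1. happens-before this store
        Atomix.set!(Atomix.IndexableRef(sentinels, (1,)), Int8(2), Atomix.release)
    end

    reader = @spawn begin
        # 3. acquire load: once it returns 2, the write from 1. is
        #    guaranteed to be visible in this task
        while Atomix.get(Atomix.IndexableRef(sentinels, (1,)), Atomix.acquire) != Int8(2)
            yield()
        end
        pivots[1]  # 4. safe plain read of the fully constructed row
    end

    wait(writer)
    fetch(reader)  # == [1, 2, 3]
end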
- if iszero(Atomix.get(Atomix.IndexableRef(sentinels, (i,)), Atomix.acquire)) + if 2 != Atomix.get(Atomix.IndexableRef(sentinels, (i,)), Atomix.acquire) if new_pivot_column == -1 new_pivot_column = i end @@ -299,313 +257,3 @@ function linalg_reduce_dense_row_by_pivots_sparse_threadsafe0!( false end - -@timeit function linalg_reduce_matrix_lower_part_threaded_task_cas!( - matrix::MacaulayMatrix{CoeffType}, - basis::Basis{CoeffType}, - arithmetic::AbstractArithmetic{AccumType, CoeffType} -) where {CoeffType <: Coeff, AccumType <: Coeff} - _, ncols = size(matrix) - nup, nlow = matrix_nrows_filled(matrix) - - # Calculate the size of the block - nblocks = nthreads() - nblocks = min(nblocks, nlow) - rem = nlow % nblocks == 0 ? 0 : 1 - rowsperblock = div(nlow, nblocks) + rem - - @log level = -3 "" nblocks rem rowsperblock nlow - - # Prepare the matrix - pivots, row_index_to_coeffs = linalg_prepare_matrix_pivots!(matrix) - resize!(matrix.some_coeffs, nlow) - resize!(matrix.sentinels, ncols) - @inbounds for i in 1:ncols - matrix.sentinels[i] = 0 - end - for i in 1:nup - matrix.sentinels[matrix.upper_rows[i][1]] = 1 - end - - tasks = Vector{Task}(undef, nblocks) - - for i in 2:nblocks - block_start = 1 + (i - 1) * rowsperblock - block_end = min(nlow, i * rowsperblock) - block_start > nlow && continue - @invariant 1 <= block_start <= block_end <= nlow - tasks[i] = Base.Threads.@spawn _reduce_matrix_lower_part_threaded_cas_worker!( - matrix, - basis, - arithmetic, - row_index_to_coeffs, - block_start, - block_end - ) - end - - _reduce_matrix_lower_part_threaded_cas_worker!( - matrix, - basis, - arithmetic, - row_index_to_coeffs, - 1, - rowsperblock - ) - - for i in 1:nblocks - if isassigned(tasks, i) - fetch(tasks[i]) - end - end - - true -end - -function _reduce_matrix_lower_part_threaded_task_cas_worker!( - matrix::MacaulayMatrix{CoeffType}, - basis::Basis{CoeffType}, - arithmetic::AbstractArithmetic{AccumType, CoeffType}, - row_index_to_coeffs, - block_start::Int, - block_end::Int -) where {CoeffType <: Coeff, AccumType <: Coeff} - _, ncols = size(matrix) - pivots = matrix.pivots - - row = zeros(AccumType, ncols) - new_sparse_row_support, new_sparse_row_coeffs = linalg_new_empty_sparse_row(CoeffType) - sentinels = matrix.sentinels - - @inbounds for i in block_start:block_end - # Select the row from the lower part of the matrix to be reduced - sparse_row_support = matrix.lower_rows[i] - # Locate the array of coefficients of this row. - # NOTE: no copy of coefficients is needed - sparse_row_coeffs = basis.coeffs[row_index_to_coeffs[i]] - @invariant length(sparse_row_support) == length(sparse_row_coeffs) - - # Load the coefficients into a dense array - linalg_load_sparse_row!(row, sparse_row_support, sparse_row_coeffs) - - success = false - while !success - - # Reduce the row with respect to the known `pivots` from the upper part - # of the matrix. - # NOTE: this also does partial interreduction of the lower matrix rows. 
- # Upon discovery of a new pivot from the lower part of the matrix, we - # add the pivot to the pool of available pivots - first_nnz_column = sparse_row_support[1] - zeroed = linalg_reduce_dense_row_by_pivots_sparse!( - new_sparse_row_support, - new_sparse_row_coeffs, - row, - matrix, - basis, - pivots, - first_nnz_column, - ncols, - arithmetic, - tmp_pos=-1 - ) - - # If the row is fully reduced - zeroed && break - - @invariant length(new_sparse_row_coeffs) == length(new_sparse_row_support) - - # Set a reference to the coefficients of this row in the matrix - old, success = Atomix.replace!( - Atomix.IndexableRef(sentinels, (Int(new_sparse_row_support[1]),)), - Int8(0), - Int8(1), - Atomix.acquire_release, - Atomix.acquire - ) - - if success - @invariant iszero(old) - linalg_normalize_row!(new_sparse_row_coeffs, arithmetic) - matrix.some_coeffs[i] = new_sparse_row_coeffs - matrix.lower_to_coeffs[new_sparse_row_support[1]] = i - pivots[new_sparse_row_support[1]] = new_sparse_row_support - else - sparse_row_support = new_sparse_row_support - sparse_row_coeffs = new_sparse_row_coeffs - end - end - - new_sparse_row_support, new_sparse_row_coeffs = - linalg_new_empty_sparse_row(CoeffType) - end -end - -@timeit function linalg_reduce_matrix_lower_part_threaded_task_lock_free!( - matrix::MacaulayMatrix{CoeffType}, - basis::Basis{CoeffType}, - arithmetic::AbstractArithmetic{AccumType, CoeffType} -) where {CoeffType <: Coeff, AccumType <: Coeff} - _, ncols = size(matrix) - _, nlow = matrix_nrows_filled(matrix) - - # Calculate the size of the block in threading - nblocks = nthreads() - nblocks = min(nblocks, nlow) - rem = nlow % nblocks == 0 ? 0 : 1 - rowsperblock = div(nlow, nblocks) + rem - - @log level = -3 "" nblocks rem rowsperblock nlow - - # Prepare the matrix - pivots, row_index_to_coeffs = linalg_prepare_matrix_pivots!(matrix) - resize!(matrix.some_coeffs, nlow) - resize!(matrix.sentinels, nlow) - @inbounds for i in 1:nlow - matrix.sentinels[i] = 0 - end - - tasks = Vector{Task}(undef, nblocks) - - for i in 2:nblocks - block_start = 1 + (i - 1) * rowsperblock - block_end = min(nlow, i * rowsperblock) - block_start > nlow && continue - @invariant 1 <= block_start <= block_end <= nlow - tasks[i] = Base.Threads.@spawn _reduce_matrix_lower_part_threaded_lock_free_worker!( - matrix, - basis, - arithmetic, - row_index_to_coeffs, - block_start, - block_end - ) - end - - _reduce_matrix_lower_part_threaded_lock_free_worker!( - matrix, - basis, - arithmetic, - row_index_to_coeffs, - 1, - rowsperblock - ) - - for i in 1:nblocks - if isassigned(tasks, i) - fetch(tasks[i]) - end - end - - row = zeros(AccumType, ncols) - new_sparse_row_support, new_sparse_row_coeffs = linalg_new_empty_sparse_row(CoeffType) - @inbounds for i in 1:nlow - if iszero(matrix.sentinels[i]) - continue - end - - # Select the row from the lower part of the matrix to be reduced - sparse_row_support = matrix.lower_rows[i] - sparse_row_coeffs = matrix.some_coeffs[i] - @invariant length(sparse_row_support) == length(sparse_row_coeffs) - - # Load the coefficients into a dense array - linalg_load_sparse_row!(row, sparse_row_support, sparse_row_coeffs) - - # Reduce the row with respect to the known `pivots` from the upper part - # of the matrix. - # NOTE: this also does partial interreduction of the lower matrix rows. 
- # Upon discovery of a new pivot from the lower part of the matrix, we - # add the pivot to the pool of available pivots - first_nnz_column = sparse_row_support[1] - zeroed = linalg_reduce_dense_row_by_pivots_sparse!( - new_sparse_row_support, - new_sparse_row_coeffs, - row, - matrix, - basis, - pivots, - first_nnz_column, - ncols, - arithmetic, - tmp_pos=-1 - ) - - # If the row is fully reduced - zeroed && continue - - @invariant length(new_sparse_row_coeffs) == length(new_sparse_row_support) - linalg_normalize_row!(new_sparse_row_coeffs, arithmetic) - - # Store the new row in the matrix, AND add it to the active pivots - matrix.some_coeffs[i] = new_sparse_row_coeffs - pivots[new_sparse_row_support[1]] = new_sparse_row_support - # Set a reference to the coefficients of this row in the matrix - matrix.lower_to_coeffs[new_sparse_row_support[1]] = i - - new_sparse_row_support, new_sparse_row_coeffs = - linalg_new_empty_sparse_row(CoeffType) - end - - true -end - -function _reduce_matrix_lower_part_threaded_task_lock_free_worker!( - matrix::MacaulayMatrix{CoeffType}, - basis::Basis{CoeffType}, - arithmetic::AbstractArithmetic{CoeffType, AccumType}, - row_index_to_coeffs, - block_start::Int, - block_end::Int -) where {CoeffType <: Coeff, AccumType <: Coeff} - _, ncols = size(matrix) - pivots = matrix.pivots - - row = zeros(AccumType, ncols) - new_sparse_row_support, new_sparse_row_coeffs = linalg_new_empty_sparse_row(CoeffType) - - @inbounds for i in block_start:block_end - # Select the row from the lower part of the matrix to be reduced - sparse_row_support = matrix.lower_rows[i] - # Locate the array of coefficients of this row. - # NOTE: no copy of coefficients is needed - sparse_row_coeffs = basis.coeffs[row_index_to_coeffs[i]] - @invariant length(sparse_row_support) == length(sparse_row_coeffs) - - # Load the coefficients into a dense array - linalg_load_sparse_row!(row, sparse_row_support, sparse_row_coeffs) - - # Reduce the row with respect to the known `pivots` from the upper part - # of the matrix. - # NOTE: this also does partial interreduction of the lower matrix rows. 
- # Upon discovery of a new pivot from the lower part of the matrix, we - # add the pivot to the pool of available pivots - first_nnz_column = sparse_row_support[1] - zeroed = linalg_reduce_dense_row_by_pivots_sparse!( - new_sparse_row_support, - new_sparse_row_coeffs, - row, - matrix, - basis, - pivots, - first_nnz_column, - ncols, - arithmetic, - tmp_pos=-1 - ) - - # If the row is fully reduced - zeroed && continue - - @invariant length(new_sparse_row_coeffs) == length(new_sparse_row_support) - linalg_normalize_row!(new_sparse_row_coeffs, arithmetic) - - # Store the new row in the matrix, AND add it to the active pivots - matrix.some_coeffs[i] = new_sparse_row_coeffs - matrix.lower_rows[i] = new_sparse_row_support - matrix.sentinels[i] = 1 - - new_sparse_row_support, new_sparse_row_coeffs = - linalg_new_empty_sparse_row(CoeffType) - end -end diff --git a/src/f4/trace.jl b/src/f4/trace.jl index 2ed855ae..4155a275 100644 --- a/src/f4/trace.jl +++ b/src/f4/trace.jl @@ -254,26 +254,29 @@ function get_default_trace(wrapped_trace::WrappedTraceF4) end function get_trace!(wrapped_trace::WrappedTraceF4, polynomials::AbstractVector, kwargs) + ring = extract_ring(polynomials) + get_trace!(wrapped_trace, ring.ch, kwargs) +end + +function get_trace!(wrapped_trace::WrappedTraceF4, char, kwargs) trace = get_default_trace(wrapped_trace) # Fast path for the case when there exists a suitable trace coefftype = trace.representation.coefftype - ring = extract_ring(polynomials) if ( - trace.representation.using_wide_type_for_coeffs && - less_than_half(ring.ch, coefftype) - ) || (!trace.representation.using_wide_type_for_coeffs && ring.ch <= typemax(coefftype)) + trace.representation.using_wide_type_for_coeffs && less_than_half(char, coefftype) + ) || (!trace.representation.using_wide_type_for_coeffs && char <= typemax(coefftype)) return trace end for id in keys(wrapped_trace.recorded_traces) - if id[1] <: Integer && ring.ch <= typemax(id[1]) + if id[1] <: Integer && char <= typemax(id[1]) return wrapped_trace.recorded_traces[id] end end # Handle the case when a wider coefficient type is required - new_coefftype = io_get_tight_unsigned_int_type(ring.ch) + new_coefftype = io_get_tight_unsigned_int_type(char) @log level = -2 "Creating a new trace with coefficient type $new_coefftype" new_trace = trace_copy(trace, new_coefftype, deepcopy=false) wrapped_trace.recorded_traces[(new_coefftype, 0)] = new_trace diff --git a/src/groebner/groebner.jl b/src/groebner/groebner.jl index e7194ce7..32f9fca6 100644 --- a/src/groebner/groebner.jl +++ b/src/groebner/groebner.jl @@ -113,13 +113,17 @@ end ### # Learn & Apply strategy -function get_next_batchsize(batchsize, batchsize_multiplier) - new_batchsize = max(batchsize + 1, round(Int, batchsize * batchsize_multiplier)) +function get_next_batchsize(primes_used, batchsize, batchsize_scaling) + new_batchsize = if batchsize == 1 + 2 + else + max(4, round(Int, primes_used * batchsize_scaling)) + end # round to the nearest number divisible by 4 if new_batchsize > 2 new_batchsize::Int = (new_batchsize + 3) & (~3) end - new_batchsize + max(new_batchsize, batchsize) end function _groebner_learn_and_apply( @@ -210,14 +214,13 @@ function _groebner_learn_and_apply( # At this point, either the reconstruction or the correctness check failed. # Continue to compute Groebner bases modulo different primes in batches. + primes_used = 1 batchsize = 1 - batchsize_multiplier = 1.4 + batchsize_scaling = 0.15 @log level = -2 """ Preparing to compute bases in batches..
The initial size of the batch is $batchsize. - The size increases in a geometric progression and is aligned by $(4). - The batch size multiplier is $batchsize_multiplier. - """ + The batch scale factor is $batchsize_scaling.""" # CRT and rational reconstruction settings indices_selection = Vector{Tuple{Int, Int}}(undef, length(state.gb_coeffs_zz)) @@ -238,9 +241,6 @@ function _groebner_learn_and_apply( end resize!(indices_selection, k - 1) unique!(indices_selection) - @log level = -2 "" length(indices_selection) length(state.gb_coeffs_zz) sum( - map(length, state.gb_coeffs_zz) - ) # Initialize partial CRT reconstruction if crt_algorithm === :incremental @@ -290,6 +290,7 @@ function _groebner_learn_and_apply( indices_selection ) end + primes_used += 4 end else for j in 1:batchsize @@ -320,30 +321,31 @@ function _groebner_learn_and_apply( indices_selection ) end + primes_used += 1 end end if crt_algorithm === :simultaneous partial_simultaneous_crt_reconstruct!(state, luckyprimes, indices_selection) end - @log level = -2 "Reconstructing coefficients to QQ" - @log level = -4 "Reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" + @log level = -2 "Partially reconstructing coefficients to QQ" + @log level = -4 "Partially reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" success_reconstruct = partial_rational_reconstruct!( state, luckyprimes, indices_selection, params.use_flint ) - @log level = -2 "Reconstruction successfull: $success_reconstruct" + @log level = -2 "Partial reconstruction successful: $success_reconstruct" @log level = -2 """ - Used $(length(luckyprimes.primes)) primes in total over $(iters + 1) iterations. + Used $(primes_used) primes in total over $(iters + 1) iterations. The current batch size is $batchsize. """ if !success_reconstruct @log level = -2 "Partial rational reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -353,7 +355,7 @@ function _groebner_learn_and_apply( if !success_check @log level = -2 "Heuristic check failed for partial reconstruction" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end end @@ -371,7 +373,7 @@ function _groebner_learn_and_apply( if !success_reconstruct @log level = -2 "Full reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -387,11 +389,11 @@ function _groebner_learn_and_apply( ) iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) end @log level = -2 "Correctness check passed!" - @log level = -2 "Used $(length(luckyprimes.primes)) primes in total over $(iters) iterations" + @log level = -2 "Used $(primes_used) primes in total over $(iters) iterations" # Construct the output basis. # Take monomials from the basis modulo a prime @@ -496,14 +498,13 @@ function _groebner_learn_and_apply_threaded( # At this point, either the reconstruction or the correctness check failed. # Continue to compute Groebner bases modulo different primes in batches.
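For intuition about the schedule that the new `get_next_batchsize` (defined above) produces: the batch size ramps 1 → 2 → 4 and afterwards tracks roughly `batchsize_scaling` times the number of primes used so far, aligned up to a multiple of 4 and never shrinking. A small illustrative driver; the function body is copied from the patch, while `demo_schedule` is merely a stand-in for the main loop:

function get_next_batchsize(primes_used, batchsize, batchsize_scaling)
    new_batchsize = if batchsize == 1
        2
    else
        max(4, round(Int, primes_used * batchsize_scaling))
    end
    # align up to the nearest multiple of 4
    if new_batchsize > 2
        new_batchsize = (new_batchsize + 3) & (~3)
    end
    # the batch size never decreases between iterations
    max(new_batchsize, batchsize)
end

function demo_schedule(; iters=8, scaling=0.15)
    primes_used, batchsize = 1, 1
    for iter in 1:iters
        batchsize = get_next_batchsize(primes_used, batchsize, scaling)
        primes_used += batchsize  # one basis is computed per prime
        println((iter, batchsize, primes_used))
    end
end

demo_schedule()  # batch sizes 2, 4, 4, ..., growing once primes_used * 0.15 exceeds 4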
- batchsize = 4 * nthreads() - batchsize_multiplier = 1.4 + primes_used = 1 + batchsize = (min(32, max(4, 2 * nthreads())) + 3) & (~3) + batchsize_scaling = 0.15 @log level = -2 """ Preparing to compute bases in batches.. The initial size of the batch is $batchsize. - The size increases in a geometric progression and is aligned by $(4). - The batch size multiplier is $batchsize_multiplier. - """ + The batch scale factor is $batchsize_scaling.""" # CRT and rational reconstruction settings indices_selection = Vector{Tuple{Int, Int}}(undef, length(state.gb_coeffs_zz)) @@ -524,9 +525,6 @@ function _groebner_learn_and_apply_threaded( end resize!(indices_selection, k - 1) unique!(indices_selection) - @log level = -2 "" length(indices_selection) length(state.gb_coeffs_zz) sum( - map(length, state.gb_coeffs_zz) - ) # Initialize partial CRT reconstruction if crt_algorithm === :incremental @@ -558,7 +556,6 @@ function _groebner_learn_and_apply_threaded( for i in 1:nthreads() empty!(threadbuf_gb_coeffs[i]) end - @log level = -1 "primes are" threadbuf_primes # NOTE: @threads with the option :static guarantees the persistence of # threadid() within a single loop iteration @@ -597,6 +594,7 @@ function _groebner_learn_and_apply_threaded( push!(threadbuf_gb_coeffs[threadid()], (threadlocal_prime_4x[3], gb_coeffs_3)) push!(threadbuf_gb_coeffs[threadid()], (threadlocal_prime_4x[4], gb_coeffs_4)) end + primes_used += batchsize threadbuf_gb_coeffs_union = reduce(vcat, threadbuf_gb_coeffs) @@ -611,15 +609,15 @@ function _groebner_learn_and_apply_threaded( partial_simultaneous_crt_reconstruct!(state, luckyprimes, indices_selection) end - @log level = -2 "Reconstructing coefficients to QQ" - @log level = -4 "Reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" + @log level = -2 "Partially reconstructing coefficients to QQ" + @log level = -4 "Partially reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" success_reconstruct = partial_rational_reconstruct!( state, luckyprimes, indices_selection, params.use_flint ) - @log level = -2 "Reconstruction successfull: $success_reconstruct" + @log level = -2 "Partial reconstruction successful: $success_reconstruct" @log level = -2 """ Used $(length(luckyprimes.primes)) primes in total over $(iters + 1) iterations. The current batch size is $batchsize. """ @@ -628,7 +626,7 @@ function _groebner_learn_and_apply_threaded( if !success_reconstruct @log level = -2 "Partial rational reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -638,7 +636,7 @@ function _groebner_learn_and_apply_threaded( if !success_check @log level = -2 "Heuristic check failed for partial reconstruction" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end end @@ -656,7 +654,7 @@ function _groebner_learn_and_apply_threaded( if !success_reconstruct @log level = -2 "Full reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -672,7 +670,7 @@ function _groebner_learn_and_apply_threaded( ) iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) end @log level = -2 "Correctness check passed!"
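The threaded path above seeds the batch size from the thread count instead of the old `4 * nthreads()`: twice the number of threads, clamped to the range [4, 32] and aligned up to a multiple of 4, which matches the groups of four primes handed out via `threadlocal_prime_4x`. A quick check of the expression (`initial_batchsize` is an illustrative name, not part of the patch):

initial_batchsize(nthr) = (min(32, max(4, 2 * nthr)) + 3) & (~3)

initial_batchsize(1)   # 4:  clamped up to the minimum
initial_batchsize(3)   # 8:  2 * 3 = 6, aligned up to a multiple of 4
initial_batchsize(8)   # 16: 2 * 8 is already a multiple of 4
initial_batchsize(30)  # 32: clamped at the cap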
@@ -771,14 +769,14 @@ function _groebner_classic_modular( # At this point, either the reconstruction or the correctness check failed. # Continue to compute Groebner bases modulo different primes in batches. + primes_used = 1 batchsize = 1 - batchsize_multiplier = 1.4 + batchsize_scaling = 0.1 @log level = -2 """ - Preparing to compute bases in batches.. - The initial size of the batch is $batchsize. - The size increases in a geometric progression. - The batch size multiplier is $batchsize_multiplier. - """ + Preparing to compute bases in batches.. + The initial size of the batch is $batchsize. + The batch scale factor is $batchsize_scaling.""" + if !tracer.ready_to_use @log level = -2 """ The tracer is disabled until the shape of the basis is determined via majority vote. @@ -805,9 +803,6 @@ function _groebner_classic_modular( end resize!(indices_selection, k - 1) unique!(indices_selection) - @log level = -2 "" length(indices_selection) length(state.gb_coeffs_zz) sum( - map(length, state.gb_coeffs_zz) - ) if crt_algorithm === :incremental partial_incremental_crt_reconstruct!(state, luckyprimes, indices_selection) @@ -841,20 +836,21 @@ function _groebner_classic_modular( if crt_algorithm === :incremental partial_incremental_crt_reconstruct!(state, luckyprimes, indices_selection) end + primes_used += 1 end if crt_algorithm === :simultaneous partial_simultaneous_crt_reconstruct!(state, luckyprimes, indices_selection) end - @log level = -2 "Reconstructing coefficients to QQ" - @log level = -4 "Reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" + @log level = -2 "Partially reconstructing coefficients to QQ" + @log level = -4 "Partially reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" success_reconstruct = partial_rational_reconstruct!( state, luckyprimes, indices_selection, params.use_flint ) - @log level = -2 "Reconstruction successfull: $success_reconstruct" + @log level = -2 "Partial reconstruction successful: $success_reconstruct" @log level = -2 """ Used $(length(luckyprimes.primes)) primes in total over $(iters + 1) iterations. The current batch size is $batchsize. """ @@ -863,7 +859,7 @@ function _groebner_classic_modular( if !success_reconstruct @log level = -2 "Partial rational reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -873,7 +869,7 @@ function _groebner_classic_modular( if !success_check @log level = -2 "Heuristic check failed for partial reconstruction" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end end @@ -891,7 +887,7 @@ function _groebner_classic_modular( if !success_reconstruct @log level = -2 "Full reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -907,7 +903,7 @@ function _groebner_classic_modular( ) iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) end @log level = -2 "Correctness check passed!"
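The "partial reconstruction" steps above lift selected coefficients from Z_m (m being the accumulated CRT modulus) back to QQ. For reference, the textbook core of rational reconstruction on a single coefficient is a truncated extended Euclidean algorithm; this is a simplified illustration, not the package's `partial_rational_reconstruct!`, which batches the work over `indices_selection` and can delegate to FLINT (`params.use_flint`):

function rational_reconstruct(a::BigInt, m::BigInt)
    # recover n//d from a ≡ n * invmod(d, m) (mod m), with |n|, d <= sqrt(m/2)
    r0, r1 = m, mod(a, m)
    t0, t1 = big(0), big(1)
    bound = isqrt(m >> 1)
    while r1 >= bound
        q = div(r0, r1)
        r0, r1 = r1, r0 - q * r1
        t0, t1 = t1, t0 - q * t1
    end
    # fail if the candidate denominator is too large or not invertible mod m
    (abs(t1) <= bound && gcd(t1, m) == 1) || return (false, big(0) // big(1))
    (true, r1 // t1)
end

m = big(2)^31 - 1
a = mod(2 * invmod(big(3), m), m)  # a ≡ 2//3 (mod m)
rational_reconstruct(a, m)         # (true, 2//3)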
diff --git a/src/groebner/learn-apply.jl b/src/groebner/learn-apply.jl index e200727a..7c30e70d 100644 --- a/src/groebner/learn-apply.jl +++ b/src/groebner/learn-apply.jl @@ -170,9 +170,10 @@ end #= Several assumptions are in place: -- input contains no zero polynomials, +- input contains no zero polynomials (i.e., no empty vectors or vectors that + contain only zeros), - input coefficients are non-negative, -- input coefficients are smaller than modulo +- input coefficients are smaller than the modulo. =# function groebner_applyX!( wrapped_trace::WrappedTraceF4, @@ -185,7 +186,7 @@ function groebner_applyX!( logging_setup(kws) statistics_setup(kws) - trace = get_default_trace(wrapped_trace) + trace = get_trace!(wrapped_trace, modulo, kws) @log level = -5 "Selected trace" trace.representation.coefftype ring = extract_coeffs_raw_X!(trace, trace.representation, coeffs_zp, modulo, kws) @@ -206,5 +207,5 @@ function groebner_applyX!( dehomogenize_generators!(ring, gb_monoms, gb_coeffs, params) end - flag, gb_coeffs::Vector{Vector{UInt32}} + flag, gb_coeffs end diff --git a/src/input-output/AbstractAlgebra.jl b/src/input-output/AbstractAlgebra.jl index 6c7ced1b..cb8fd77e 100644 --- a/src/input-output/AbstractAlgebra.jl +++ b/src/input-output/AbstractAlgebra.jl @@ -308,6 +308,7 @@ function extract_coeffs_raw_X!( # a hack for homogenized inputs if trace.homogenize + @assert false @assert length(basis.monoms[length(polys) + 1]) == length(basis.coeffs[length(polys) + 1]) == 2 @@ -347,6 +348,9 @@ function _extract_coeffs_raw_X!( basis_cfs = basis.coeffs[i] poly_index = input_polys_perm[i] poly = coeffs_zp[poly_index] + if isempty(poly) + __throw_input_not_supported("Zero input polynomial", poly) + end if !(length(poly) == length(basis_cfs)) __throw_input_not_supported( "Potential coefficient cancellation in input polynomial at index $i on apply stage.", diff --git a/test/fglm/kbase.jl b/test/fglm/kbase.jl index 123bde41..6319377d 100644 --- a/test/fglm/kbase.jl +++ b/test/fglm/kbase.jl @@ -97,8 +97,8 @@ end gb = Groebner.groebner(noon, ordering=Groebner.DegRevLex()) @test length(Groebner.kbase(gb, ordering=Groebner.DegRevLex())) == 21 - noon = Groebner.noonn(7, k=GF(2^31 - 1)) - R = parent(first(noon)) - gb = Groebner.groebner(noon, ordering=Groebner.DegRevLex()) - @test length(Groebner.kbase(gb, ordering=Groebner.DegRevLex())) == 2173 + sys = Groebner.eco10(k=GF(2^31 - 1), ordering=:degrevlex) + R = parent(first(sys)) + gb = Groebner.groebner(sys, ordering=Groebner.DegRevLex()) + @test length(Groebner.kbase(gb, ordering=Groebner.DegRevLex())) == 256 end diff --git a/test/learn_and_apply/learn_and_apply_tmp.jl b/test/learn_and_apply/learn_and_apply_tmp.jl new file mode 100644 index 00000000..d766f77a --- /dev/null +++ b/test/learn_and_apply/learn_and_apply_tmp.jl @@ -0,0 +1,155 @@ +import Random + +params = (loglevel=0, sweep=true) + +@testset "learn & apply, same field" begin + K = AbstractAlgebra.GF(2^31 - 1) + R, (x, y) = polynomial_ring(K, ["x", "y"], ordering=:degrevlex) + trace, gb1 = Groebner.groebner_learn([x + 2, y + 3]) + flag, cfs = Groebner.groebner_applyX!( + trace, + [[UInt32(1), UInt32(3)], [UInt32(1), UInt32(2)]], + UInt32(2^27 - 39) + ) + @assert flag && cfs == [[UInt32(1), UInt32(2)], [UInt32(1), UInt32(3)]] + + cases = [ + (system=[x],), + (system=[R(1)],), + (system=[x, y],), + (system=[x, x],), + (system=[x, y, x * y],), + (system=[x + 1, y + 2, x * y + 3],), + (system=[x^20 * y + x + 1, x * y^20 + y + 1],), + (system=[x^20 * y + x + 1, x * y^20 + y + 1],), + 
(system=Groebner.noonn(3, ordering=:degrevlex, k=K),), + (system=Groebner.noonn(4, ordering=:degrevlex, k=K),), + (system=Groebner.noonn(5, ordering=:degrevlex, k=K),), + (system=Groebner.katsuran(3, ordering=:degrevlex, k=K),), + (system=Groebner.katsuran(4, ordering=:degrevlex, k=K),), + (system=Groebner.katsuran(5, ordering=:degrevlex, k=K),), + (system=Groebner.cyclicn(5, ordering=:degrevlex, k=K),), + (system=Groebner.rootn(5, ordering=:degrevlex, k=K),), + (system=Groebner.rootn(5, ordering=:degrevlex, k=K),), + (system=Groebner.rootn(6, ordering=:degrevlex, k=K),), + (system=Groebner.eco5(ordering=:degrevlex, k=K),), + (system=Groebner.eco10(ordering=:degrevlex, k=K),), + (system=Groebner.ku10(ordering=:degrevlex, k=K),), + (system=Groebner.kinema(ordering=:degrevlex, k=K),), + (system=Groebner.sparse5(ordering=:degrevlex, k=K),), + (system=Groebner.s9_1(ordering=:degrevlex, k=K),) + ] + + for case in cases + # Learn and apply on the same system + system = case.system + true_gb = Groebner.groebner(system; params...) + trace, gb_1 = Groebner.groebner_learn(system; params...) + cfs = map( + f -> map(c -> UInt32(data(c)), collect(AbstractAlgebra.coefficients(f))), + system + ) + flag, cfs_2 = Groebner.groebner_applyX!( + trace, + cfs, + UInt32(AbstractAlgebra.characteristic(K)); + params... + ) + @test flag && map(f -> collect(coefficients(f)), true_gb) == cfs_2 + end +end + +@testset "learn & apply, different field" begin + # Some small tests and corner cases + K, K2, K3 = GF(5), GF(7), GF(11) + R, (x, y) = polynomial_ring(K, ["x", "y"], ordering=:degrevlex) + R2, (x2, y2) = polynomial_ring(K2, ["x", "y"], ordering=:degrevlex) + R3, (x3, y3) = polynomial_ring(K3, ["x", "y"], ordering=:degrevlex) + system = [x^4 + 2y^3 + 3x^2 + 4y, 4y^4 + 3x^3 + 2y^2 + 1x] + system2 = map(f -> map_coefficients(c -> K2(data(c)), f), system) + + trace, gb_1 = Groebner.groebner_learn(system; params...) + cfs = map( + f -> map(c -> UInt32(data(c)), collect(AbstractAlgebra.coefficients(f))), + system2 + ) + flag, cfs_2 = Groebner.groebner_applyX!( + trace, + cfs, + UInt32(AbstractAlgebra.characteristic(K2)); + params... + ) + @test flag && map(f -> collect(coefficients(f)), Groebner.groebner(system2)) == cfs_2 + + # NOTE: these systems are only moderately large. + # We should also test for larger systems and larger primes!!
+ R, (x, y) = polynomial_ring(ZZ, ["x", "y"], ordering=:degrevlex) + cases = [ + (system=[x],), + (system=[R(1)],), + (system=[x, y],), + (system=[x, x],), + (system=[x, y, x * y],), + (system=[x + 1, y + 2, x * y + 3],), + (system=[x^20 * y + x + 1, x * y^20 + y + 1],), + (system=[x^20 * y + x + 1, x * y^20 + y + 1],), + (system=Groebner.noonn(3, ordering=:degrevlex, k=ZZ),), + (system=Groebner.noonn(4, ordering=:degrevlex, k=ZZ),), + (system=Groebner.noonn(5, ordering=:degrevlex, k=ZZ),), + (system=Groebner.katsuran(3, ordering=:degrevlex, k=ZZ),), + (system=Groebner.katsuran(4, ordering=:degrevlex, k=ZZ),), + (system=Groebner.katsuran(5, ordering=:degrevlex, k=ZZ),), + (system=Groebner.katsuran(6, ordering=:degrevlex, k=ZZ),), + (system=Groebner.katsuran(7, ordering=:degrevlex, k=ZZ),), + (system=Groebner.cyclicn(5, ordering=:degrevlex, k=ZZ),), + (system=Groebner.cyclicn(6, ordering=:degrevlex, k=ZZ),), + (system=Groebner.rootn(5, ordering=:degrevlex, k=ZZ),), + (system=Groebner.rootn(5, ordering=:degrevlex, k=ZZ),), + (system=Groebner.rootn(6, ordering=:degrevlex, k=ZZ),), + (system=Groebner.eco5(ordering=:degrevlex, k=ZZ),), + (system=Groebner.eco10(ordering=:degrevlex, k=ZZ),), + (system=Groebner.eco11(ordering=:degrevlex, k=ZZ),), + (system=Groebner.ku10(ordering=:degrevlex, k=ZZ),), + (system=Groebner.kinema(ordering=:degrevlex, k=ZZ),), + (system=Groebner.sparse5(ordering=:degrevlex, k=ZZ),), + (system=Groebner.s9_1(ordering=:degrevlex, k=ZZ),) + ] + + # Some bigger tests + K = AbstractAlgebra.GF(2^31 - 1) + Ks = [ + AbstractAlgebra.GF(2^31 - 1), + AbstractAlgebra.GF(2^30 + 3), + AbstractAlgebra.GF(2^31 + 11), + AbstractAlgebra.GF(2^27 - 39), + AbstractAlgebra.GF(2^20 + 7), + AbstractAlgebra.GF(2^20 + 13), + AbstractAlgebra.GF(2^20 + 25) + ] + for case in cases + # Learn and apply on the same system + system = case.system + system_zp = map(f -> map_coefficients(c -> K(BigInt(c)), f), system) + true_gb = Groebner.groebner(system_zp; params...) + trace, gb_1 = Groebner.groebner_learn(system_zp; params...) + + # Apply on the same system but modulo a different prime + for K_ in Ks + system_zp_2 = map(f -> map_coefficients(c -> K_(BigInt(c)), f), system) + cfs = map( + f -> + map(c -> UInt32(data(c)), collect(AbstractAlgebra.coefficients(f))), + system_zp_2 + ) + flag, cfs_2 = Groebner.groebner_applyX!( + trace, + cfs, + UInt32(AbstractAlgebra.characteristic(K_)); + params... + ) + @test flag && + map(f -> collect(coefficients(f)), Groebner.groebner(system_zp_2)) == + cfs_2 + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index c66b7995..55e455fd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -56,7 +56,8 @@ end @time @includetests [ "learn_and_apply/learn_and_apply", - "learn_and_apply/apply_in_batches" + "learn_and_apply/apply_in_batches", + "learn_and_apply/learn_and_apply_tmp" ] @time @includetests ["isgroebner/isgroebner"]
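Taken together, the `groebner_applyX!` changes and the new test file pin down the cross-prime workflow: learn a trace once over one prime field, then replay the recorded computation on raw coefficient vectors modulo other primes. Condensed into a standalone script mirroring the smoke test at the top of `learn_and_apply_tmp.jl`:

using AbstractAlgebra, Groebner

K = GF(2^31 - 1)
R, (x, y) = polynomial_ring(K, ["x", "y"], ordering=:degrevlex)
trace, gb = Groebner.groebner_learn([x + 2, y + 3])

# Raw coefficients of the input system modulo the new prime 2^27 - 39,
# one UInt32 vector per polynomial; zero polynomials are not accepted.
coeffs_zp = [[UInt32(1), UInt32(3)], [UInt32(1), UInt32(2)]]
flag, gb_coeffs = Groebner.groebner_applyX!(trace, coeffs_zp, UInt32(2^27 - 39))
@assert flag && gb_coeffs == [[UInt32(1), UInt32(2)], [UInt32(1), UInt32(3)]]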