diff --git a/src/f4/linalg/backend_randomized_threaded.jl b/src/f4/linalg/backend_randomized_threaded.jl index 6d83683a..b83fcfa0 100644 --- a/src/f4/linalg/backend_randomized_threaded.jl +++ b/src/f4/linalg/backend_randomized_threaded.jl @@ -72,16 +72,18 @@ function linalg_randomized_reduce_matrix_lower_part_threaded_cas!( resize!(matrix.some_coeffs, nlow) resize!(matrix.sentinels, ncols) - # If there is a pivot with the leading column i, then sentinels[i] = 1. + # If there is a pivot with the leading column i, then we set + # sentinels[i] = 1. + # If sentinels[i] = 1, and if the pivot is synced, then we set + # sentinels[i] = 2. # Otherwise, sentinels[i] = 0. - # Once sentinels[i] becomes 1, this equality is unchanged. - # NOTE: for all valid indices, isassigned(sentinels, i) must hold. + # Once sentinels[i] becomes 1 or 2, this is unchanged. sentinels = matrix.sentinels @inbounds for i in 1:ncols sentinels[i] = 0 end @inbounds for i in 1:nup - sentinels[matrix.upper_rows[i][1]] = 1 + sentinels[matrix.upper_rows[i][1]] = 2 end # Allocate thread-local buffers @@ -135,7 +137,7 @@ function linalg_randomized_reduce_matrix_lower_part_threaded_cas!( while !success # Reduce the combination by pivots @invariant 1 <= first_nnz_col <= length(t_local_row) - zeroed = linalg_reduce_dense_row_by_pivots_sparse!( + zeroed = linalg_reduce_dense_row_by_pivots_sparse_threadsafe0!( new_sparse_row_support, new_sparse_row_coeffs, t_local_row, @@ -145,6 +147,7 @@ function linalg_randomized_reduce_matrix_lower_part_threaded_cas!( first_nnz_col, ncols, arithmetic, + sentinels, tmp_pos=-1 ) @@ -154,14 +157,6 @@ function linalg_randomized_reduce_matrix_lower_part_threaded_cas!( break end - # At this point, we have discovered a row that is not reduced to - # zero (yet). Two outcomes are possible: - # - either this row is a new pivot row. In this case, - # sentinels[...] is 0. We set sentinels[...] to 1, update the - # pivots and the matrix, and break out of the loop. - # - or, this row can be further reduced. In this case, - # sentinels[...] was already 1. - # Sync point. Everything before this point becomes visible to # other threads once they reach this point. # NOTE: Note the absence of a total order on atomic operations @@ -187,6 +182,20 @@ function linalg_randomized_reduce_matrix_lower_part_threaded_cas!( matrix.some_coeffs[absolute_row_index] = new_sparse_row_coeffs matrix.lower_to_coeffs[new_sparse_row_support[1]] = absolute_row_index pivots[new_sparse_row_support[1]] = new_sparse_row_support + + # This atomic write is paired with an atomic load in + # `linalg_reduce_dense_row_by_pivots_sparse_threadsafe0!`. The + # purpose of these atomics is to fence the above assignment + # pivots[new_sparse_row_support[1]] = new_sparse_row_support + # + # This way, the next atomic load from the same location in + # `sentinels` will also receive all relevant modifications to + # `pivots` as a side-effect. + Atomix.set!( + Atomix.IndexableRef(sentinels, (Int(new_sparse_row_support[1]),)), + Int8(2), + Atomix.release + ) else # go for another iteration first_nnz_col = new_sparse_row_support[1] diff --git a/src/f4/linalg/backend_threaded.jl b/src/f4/linalg/backend_threaded.jl index 45f73e0f..668081aa 100644 --- a/src/f4/linalg/backend_threaded.jl +++ b/src/f4/linalg/backend_threaded.jl @@ -56,16 +56,16 @@ function linalg_reduce_matrix_lower_part_threaded_cas!( # If there is a pivot with the leading column i, then we set # sentinels[i] = 1.
- # If sentinels[i] = 1, and the pivot is synced, then we set + # If sentinels[i] = 1, and if the pivot is synced, then we set # sentinels[i] = 2. # Otherwise, sentinels[i] = 0. - # Once sentinels[i] becomes 1 or 2, this equality is unchanged. + # Once sentinels[i] becomes 1 or 2, this is unchanged. sentinels = matrix.sentinels @inbounds for i in 1:ncols sentinels[i] = 0 end @inbounds for i in 1:nup - sentinels[matrix.upper_rows[i][1]] = 1 + sentinels[matrix.upper_rows[i][1]] = 2 end # Allocate thread-local buffers @@ -114,15 +114,6 @@ function linalg_reduce_matrix_lower_part_threaded_cas!( # If the row is fully reduced zeroed && break - # At this point, we have discovered a row that is not reduced to - # zero (yet). Thus, two outcomes are possible: - # 1. sentinels[...] is 0, meaning that this row is potentially a new - # pivot row. We try to set sentinels[...] to 1, update the pivots - # and the matrix, and break out of the loop. - # If assigning sentinels[...] to 1 fails, go to 2. - # 2. sentinels[...] is already 1, meaning that this row can be - # further reduced on the following iteration - # Sync point. Everything before this point becomes visible to # all other threads once they reach this point. # NOTE: Note the absence of a total ordering on atomic operations @@ -154,15 +145,9 @@ function linalg_reduce_matrix_lower_part_threaded_cas!( # This way, the next atomic load from the same location in # `sentinels` will also receive all relevant modifications to # `pivots` as a side-effect. - # - # Roughly speaking, in single-thread, this construction is - # similar to an explicit barrier in C - # asm volatile("" ::: "memory"); - # which can be used to tell the compiler that the writes and - # loads cannot be freely moved below or above the barrier. Atomix.set!( Atomix.IndexableRef(sentinels, (Int(new_sparse_row_support[1]),)), - Int8(1), + Int8(2), Atomix.release ) else @@ -206,34 +191,7 @@ function linalg_reduce_dense_row_by_pivots_sparse_threadsafe0!( continue end - # Say, there are two threads, A and B. Thread A non-atomically writes to - # pivots[i]. Then loading pivots[i] in thread B may produce a value out - # of thin air. - # - # Modifying pivots[i] atomically is impossible in Julia. - # Thus, we may need additional syncronization in this case. - # (Note that this is usually not an issue for x86-64, since it performs - # well-aligned writes and loads atomically by default, but we should not - # rely on that) - # - # From https://en.cppreference.com/w/cpp/atomic/memory_order, if thread - # B performs (at least) acquire atomic memory load, then "all memory - # writes that happened-before the atomic store from the point of view of - # thread A, become visible side-effects in thread B". - # - # In other words, consider the following "sequence" of writes/loads: - # thread A: - # 1. write pivots[i] = X - # 2. write sentinels[i] = 1 (atomic) - # thread B: - # 3. load sentinels[i] (atomic) - # 4. load pivots[i] - # If 2. happened before 3., then 1. happened before 4. - # - # For this reason, we perform the atomic write 2. in - # `linalg_reduce_matrix_lower_part_threaded_cas_maybe_correct!`, and - # perform the atomic load 3. here.
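The comments deleted above spelled out the memory-ordering argument in detail; the essence is: a thread first fills in the new pivot row with plain writes, then publishes it with a release store of 2 into `sentinels`, and any thread that observes 2 via an acquire load is guaranteed to also observe the finished writes to `pivots`. A minimal standalone sketch of this pairing, reusing the same Atomix calls as the patch (the function and variable names here are illustrative, not Groebner internals):

using Atomix
using Base.Threads: @spawn

function publish_and_read()
    pivots = Vector{Vector{Int}}(undef, 4)  # plain, non-atomic payload
    sentinels = zeros(Int8, 4)              # atomic flags; 0 = slot empty

    writer = @spawn begin
        pivots[1] = [1, 2, 3]  # 1. plain write of the payload
        # 2. release store: the write in 1. happens-before this store
        Atomix.set!(Atomix.IndexableRef(sentinels, (1,)), Int8(2), Atomix.release)
    end

    reader = @spawn begin
        # 3. acquire load: once it returns 2, the write from 1. is
        #    guaranteed to be visible in this task
        while Atomix.get(Atomix.IndexableRef(sentinels, (1,)), Atomix.acquire) != Int8(2)
            yield()
        end
        pivots[1]  # 4. safe plain read of the fully constructed row
    end

    wait(writer)
    fetch(reader)  # == [1, 2, 3]
end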
- if iszero(Atomix.get(Atomix.IndexableRef(sentinels, (i,)), Atomix.acquire)) + if 2 != Atomix.get(Atomix.IndexableRef(sentinels, (i,)), Atomix.acquire) if new_pivot_column == -1 new_pivot_column = i end @@ -299,313 +257,3 @@ function linalg_reduce_dense_row_by_pivots_sparse_threadsafe0!( false end - -@timeit function linalg_reduce_matrix_lower_part_threaded_task_cas!( - matrix::MacaulayMatrix{CoeffType}, - basis::Basis{CoeffType}, - arithmetic::AbstractArithmetic{AccumType, CoeffType} -) where {CoeffType <: Coeff, AccumType <: Coeff} - _, ncols = size(matrix) - nup, nlow = matrix_nrows_filled(matrix) - - # Calculate the size of the block - nblocks = nthreads() - nblocks = min(nblocks, nlow) - rem = nlow % nblocks == 0 ? 0 : 1 - rowsperblock = div(nlow, nblocks) + rem - - @log level = -3 "" nblocks rem rowsperblock nlow - - # Prepare the matrix - pivots, row_index_to_coeffs = linalg_prepare_matrix_pivots!(matrix) - resize!(matrix.some_coeffs, nlow) - resize!(matrix.sentinels, ncols) - @inbounds for i in 1:ncols - matrix.sentinels[i] = 0 - end - for i in 1:nup - matrix.sentinels[matrix.upper_rows[i][1]] = 1 - end - - tasks = Vector{Task}(undef, nblocks) - - for i in 2:nblocks - block_start = 1 + (i - 1) * rowsperblock - block_end = min(nlow, i * rowsperblock) - block_start > nlow && continue - @invariant 1 <= block_start <= block_end <= nlow - tasks[i] = Base.Threads.@spawn _reduce_matrix_lower_part_threaded_cas_worker!( - matrix, - basis, - arithmetic, - row_index_to_coeffs, - block_start, - block_end - ) - end - - _reduce_matrix_lower_part_threaded_cas_worker!( - matrix, - basis, - arithmetic, - row_index_to_coeffs, - 1, - rowsperblock - ) - - for i in 1:nblocks - if isassigned(tasks, i) - fetch(tasks[i]) - end - end - - true -end - -function _reduce_matrix_lower_part_threaded_task_cas_worker!( - matrix::MacaulayMatrix{CoeffType}, - basis::Basis{CoeffType}, - arithmetic::AbstractArithmetic{AccumType, CoeffType}, - row_index_to_coeffs, - block_start::Int, - block_end::Int -) where {CoeffType <: Coeff, AccumType <: Coeff} - _, ncols = size(matrix) - pivots = matrix.pivots - - row = zeros(AccumType, ncols) - new_sparse_row_support, new_sparse_row_coeffs = linalg_new_empty_sparse_row(CoeffType) - sentinels = matrix.sentinels - - @inbounds for i in block_start:block_end - # Select the row from the lower part of the matrix to be reduced - sparse_row_support = matrix.lower_rows[i] - # Locate the array of coefficients of this row. - # NOTE: no copy of coefficients is needed - sparse_row_coeffs = basis.coeffs[row_index_to_coeffs[i]] - @invariant length(sparse_row_support) == length(sparse_row_coeffs) - - # Load the coefficients into a dense array - linalg_load_sparse_row!(row, sparse_row_support, sparse_row_coeffs) - - success = false - while !success - - # Reduce the row with respect to the known `pivots` from the upper part - # of the matrix. - # NOTE: this also does partial interreduction of the lower matrix rows. 
- # Upon discovery of a new pivot from the lower part of the matrix, we - # add the pivot to the pool of available pivots - first_nnz_column = sparse_row_support[1] - zeroed = linalg_reduce_dense_row_by_pivots_sparse!( - new_sparse_row_support, - new_sparse_row_coeffs, - row, - matrix, - basis, - pivots, - first_nnz_column, - ncols, - arithmetic, - tmp_pos=-1 - ) - - # If the row is fully reduced - zeroed && break - - @invariant length(new_sparse_row_coeffs) == length(new_sparse_row_support) - - # Set a reference to the coefficients of this row in the matrix - old, success = Atomix.replace!( - Atomix.IndexableRef(sentinels, (Int(new_sparse_row_support[1]),)), - Int8(0), - Int8(1), - Atomix.acquire_release, - Atomix.acquire - ) - - if success - @invariant iszero(old) - linalg_normalize_row!(new_sparse_row_coeffs, arithmetic) - matrix.some_coeffs[i] = new_sparse_row_coeffs - matrix.lower_to_coeffs[new_sparse_row_support[1]] = i - pivots[new_sparse_row_support[1]] = new_sparse_row_support - else - sparse_row_support = new_sparse_row_support - sparse_row_coeffs = new_sparse_row_coeffs - end - end - - new_sparse_row_support, new_sparse_row_coeffs = - linalg_new_empty_sparse_row(CoeffType) - end -end - -@timeit function linalg_reduce_matrix_lower_part_threaded_task_lock_free!( - matrix::MacaulayMatrix{CoeffType}, - basis::Basis{CoeffType}, - arithmetic::AbstractArithmetic{AccumType, CoeffType} -) where {CoeffType <: Coeff, AccumType <: Coeff} - _, ncols = size(matrix) - _, nlow = matrix_nrows_filled(matrix) - - # Calculate the size of the block in threading - nblocks = nthreads() - nblocks = min(nblocks, nlow) - rem = nlow % nblocks == 0 ? 0 : 1 - rowsperblock = div(nlow, nblocks) + rem - - @log level = -3 "" nblocks rem rowsperblock nlow - - # Prepare the matrix - pivots, row_index_to_coeffs = linalg_prepare_matrix_pivots!(matrix) - resize!(matrix.some_coeffs, nlow) - resize!(matrix.sentinels, nlow) - @inbounds for i in 1:nlow - matrix.sentinels[i] = 0 - end - - tasks = Vector{Task}(undef, nblocks) - - for i in 2:nblocks - block_start = 1 + (i - 1) * rowsperblock - block_end = min(nlow, i * rowsperblock) - block_start > nlow && continue - @invariant 1 <= block_start <= block_end <= nlow - tasks[i] = Base.Threads.@spawn _reduce_matrix_lower_part_threaded_lock_free_worker!( - matrix, - basis, - arithmetic, - row_index_to_coeffs, - block_start, - block_end - ) - end - - _reduce_matrix_lower_part_threaded_lock_free_worker!( - matrix, - basis, - arithmetic, - row_index_to_coeffs, - 1, - rowsperblock - ) - - for i in 1:nblocks - if isassigned(tasks, i) - fetch(tasks[i]) - end - end - - row = zeros(AccumType, ncols) - new_sparse_row_support, new_sparse_row_coeffs = linalg_new_empty_sparse_row(CoeffType) - @inbounds for i in 1:nlow - if iszero(matrix.sentinels[i]) - continue - end - - # Select the row from the lower part of the matrix to be reduced - sparse_row_support = matrix.lower_rows[i] - sparse_row_coeffs = matrix.some_coeffs[i] - @invariant length(sparse_row_support) == length(sparse_row_coeffs) - - # Load the coefficients into a dense array - linalg_load_sparse_row!(row, sparse_row_support, sparse_row_coeffs) - - # Reduce the row with respect to the known `pivots` from the upper part - # of the matrix. - # NOTE: this also does partial interreduction of the lower matrix rows. 
- # Upon discovery of a new pivot from the lower part of the matrix, we - # add the pivot to the pool of available pivots - first_nnz_column = sparse_row_support[1] - zeroed = linalg_reduce_dense_row_by_pivots_sparse!( - new_sparse_row_support, - new_sparse_row_coeffs, - row, - matrix, - basis, - pivots, - first_nnz_column, - ncols, - arithmetic, - tmp_pos=-1 - ) - - # If the row is fully reduced - zeroed && continue - - @invariant length(new_sparse_row_coeffs) == length(new_sparse_row_support) - linalg_normalize_row!(new_sparse_row_coeffs, arithmetic) - - # Store the new row in the matrix, AND add it to the active pivots - matrix.some_coeffs[i] = new_sparse_row_coeffs - pivots[new_sparse_row_support[1]] = new_sparse_row_support - # Set a reference to the coefficients of this row in the matrix - matrix.lower_to_coeffs[new_sparse_row_support[1]] = i - - new_sparse_row_support, new_sparse_row_coeffs = - linalg_new_empty_sparse_row(CoeffType) - end - - true -end - -function _reduce_matrix_lower_part_threaded_task_lock_free_worker!( - matrix::MacaulayMatrix{CoeffType}, - basis::Basis{CoeffType}, - arithmetic::AbstractArithmetic{CoeffType, AccumType}, - row_index_to_coeffs, - block_start::Int, - block_end::Int -) where {CoeffType <: Coeff, AccumType <: Coeff} - _, ncols = size(matrix) - pivots = matrix.pivots - - row = zeros(AccumType, ncols) - new_sparse_row_support, new_sparse_row_coeffs = linalg_new_empty_sparse_row(CoeffType) - - @inbounds for i in block_start:block_end - # Select the row from the lower part of the matrix to be reduced - sparse_row_support = matrix.lower_rows[i] - # Locate the array of coefficients of this row. - # NOTE: no copy of coefficients is needed - sparse_row_coeffs = basis.coeffs[row_index_to_coeffs[i]] - @invariant length(sparse_row_support) == length(sparse_row_coeffs) - - # Load the coefficients into a dense array - linalg_load_sparse_row!(row, sparse_row_support, sparse_row_coeffs) - - # Reduce the row with respect to the known `pivots` from the upper part - # of the matrix. - # NOTE: this also does partial interreduction of the lower matrix rows. 
- # Upon discovery of a new pivot from the lower part of the matrix, we - # add the pivot to the pool of available pivots - first_nnz_column = sparse_row_support[1] - zeroed = linalg_reduce_dense_row_by_pivots_sparse!( - new_sparse_row_support, - new_sparse_row_coeffs, - row, - matrix, - basis, - pivots, - first_nnz_column, - ncols, - arithmetic, - tmp_pos=-1 - ) - - # If the row is fully reduced - zeroed && continue - - @invariant length(new_sparse_row_coeffs) == length(new_sparse_row_support) - linalg_normalize_row!(new_sparse_row_coeffs, arithmetic) - - # Store the new row in the matrix, AND add it to the active pivots - matrix.some_coeffs[i] = new_sparse_row_coeffs - matrix.lower_rows[i] = new_sparse_row_support - matrix.sentinels[i] = 1 - - new_sparse_row_support, new_sparse_row_coeffs = - linalg_new_empty_sparse_row(CoeffType) - end -end diff --git a/src/f4/trace.jl b/src/f4/trace.jl index 2ed855ae..4155a275 100644 --- a/src/f4/trace.jl +++ b/src/f4/trace.jl @@ -254,26 +254,29 @@ function get_default_trace(wrapped_trace::WrappedTraceF4) end function get_trace!(wrapped_trace::WrappedTraceF4, polynomials::AbstractVector, kwargs) + ring = extract_ring(polynomials) + get_trace!(wrapped_trace, ring.ch, kwargs) +end + +function get_trace!(wrapped_trace::WrappedTraceF4, char, kwargs) trace = get_default_trace(wrapped_trace) # Fast path for the case when there exists a suitable trace coefftype = trace.representation.coefftype - ring = extract_ring(polynomials) if ( - trace.representation.using_wide_type_for_coeffs && - less_than_half(ring.ch, coefftype) - ) || (!trace.representation.using_wide_type_for_coeffs && ring.ch <= typemax(coefftype)) + trace.representation.using_wide_type_for_coeffs && less_than_half(char, coefftype) + ) || (!trace.representation.using_wide_type_for_coeffs && char <= typemax(coefftype)) return trace end for id in keys(wrapped_trace.recorded_traces) - if id[1] <: Integer && ring.ch <= typemax(id[1]) + if id[1] <: Integer && char <= typemax(id[1]) return wrapped_trace.recorded_traces[id] end end # Handle the case when a wider coefficient type is required - new_coefftype = io_get_tight_unsigned_int_type(ring.ch) + new_coefftype = io_get_tight_unsigned_int_type(char) @log level = -2 "Creating a new trace with coefficient type $new_coefftype" new_trace = trace_copy(trace, new_coefftype, deepcopy=false) wrapped_trace.recorded_traces[(new_coefftype, 0)] = new_trace diff --git a/src/groebner/groebner.jl b/src/groebner/groebner.jl index e7194ce7..32f9fca6 100644 --- a/src/groebner/groebner.jl +++ b/src/groebner/groebner.jl @@ -113,13 +113,17 @@ end ### # Learn & Apply strategy -function get_next_batchsize(batchsize, batchsize_multiplier) - new_batchsize = max(batchsize + 1, round(Int, batchsize * batchsize_multiplier)) +function get_next_batchsize(primes_used, batchsize, batchsize_scaling) + new_batchsize = if batchsize == 1 + 2 + else + max(4, round(Int, primes_used * batchsize_scaling)) + end # round to the nearest number divisible by 4 if new_batchsize > 2 new_batchsize::Int = (new_batchsize + 3) & (~3) end - new_batchsize + max(new_batchsize, batchsize) end function _groebner_learn_and_apply( @@ -210,14 +214,13 @@ function _groebner_learn_and_apply( # At this point, either the reconstruction or the correctness check failed. # Continue to compute Groebner bases modulo different primes in batches. + primes_used = 1 batchsize = 1 - batchsize_multiplier = 1.4 + batchsize_scaling = 0.15 @log level = -2 """ Preparing to compute bases in batches..
The initial size of the batch is $batchsize. - The size increases in a geometric progression and is aligned by $(4). - The batch size multiplier is $batchsize_multiplier. - """ + The batch scale factor is $batchsize_scaling.""" # CRT and rational reconstruction settings indices_selection = Vector{Tuple{Int, Int}}(undef, length(state.gb_coeffs_zz)) @@ -238,9 +241,6 @@ function _groebner_learn_and_apply( end resize!(indices_selection, k - 1) unique!(indices_selection) - @log level = -2 "" length(indices_selection) length(state.gb_coeffs_zz) sum( - map(length, state.gb_coeffs_zz) - ) # Initialize partial CRT reconstruction if crt_algorithm === :incremental @@ -290,6 +290,7 @@ function _groebner_learn_and_apply( indices_selection ) end + primes_used += 4 end else for j in 1:batchsize @@ -320,30 +321,31 @@ function _groebner_learn_and_apply( indices_selection ) end + primes_used += 1 end end if crt_algorithm === :simultaneous partial_simultaneous_crt_reconstruct!(state, luckyprimes, indices_selection) end - @log level = -2 "Reconstructing coefficients to QQ" - @log level = -4 "Reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" + @log level = -2 "Partially reconstructing coefficients to QQ" + @log level = -4 "Partially reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" success_reconstruct = partial_rational_reconstruct!( state, luckyprimes, indices_selection, params.use_flint ) - @log level = -2 "Reconstruction successfull: $success_reconstruct" + @log level = -2 "Partial reconstruction successful: $success_reconstruct" @log level = -2 """ - Used $(length(luckyprimes.primes)) primes in total over $(iters + 1) iterations. + Used $(primes_used) primes in total over $(iters + 1) iterations. The current batch size is $batchsize. """ if !success_reconstruct @log level = -2 "Partial rational reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -353,7 +355,7 @@ function _groebner_learn_and_apply( if !success_check @log level = -2 "Heuristic check failed for partial reconstruction" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end end @@ -371,7 +373,7 @@ function _groebner_learn_and_apply( if !success_reconstruct @log level = -2 "Full reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -387,11 +389,11 @@ function _groebner_learn_and_apply( ) iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) end @log level = -2 "Correctness check passed!" - @log level = -2 "Used $(length(luckyprimes.primes)) primes in total over $(iters) iterations" + @log level = -2 "Used $(primes_used) primes in total over $(iters) iterations" # Construct the output basis. # Take monomials from the basis modulo a prime @@ -496,14 +498,13 @@ function _groebner_learn_and_apply_threaded( # At this point, either the reconstruction or the correctness check failed. # Continue to compute Groebner bases modulo different primes in batches.
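For intuition about the schedule that the new `get_next_batchsize` (defined above) produces: the batch size ramps 1 → 2 → 4 and afterwards tracks roughly `batchsize_scaling` times the number of primes used so far, aligned up to a multiple of 4 and never shrinking. A small illustrative driver; the function body is copied from the patch, while `demo_schedule` is merely a stand-in for the main loop:

function get_next_batchsize(primes_used, batchsize, batchsize_scaling)
    new_batchsize = if batchsize == 1
        2
    else
        max(4, round(Int, primes_used * batchsize_scaling))
    end
    # align up to the nearest multiple of 4
    if new_batchsize > 2
        new_batchsize = (new_batchsize + 3) & (~3)
    end
    # the batch size never decreases between iterations
    max(new_batchsize, batchsize)
end

function demo_schedule(; iters=8, scaling=0.15)
    primes_used, batchsize = 1, 1
    for iter in 1:iters
        batchsize = get_next_batchsize(primes_used, batchsize, scaling)
        primes_used += batchsize  # one basis is computed per prime
        println((iter, batchsize, primes_used))
    end
end

demo_schedule()  # batch sizes 2, 4, 4, ..., growing once primes_used * 0.15 exceeds 4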
- batchsize = 4 * nthreads() - batchsize_multiplier = 1.4 + primes_used = 1 + batchsize = (min(32, max(4, 2 * nthreads())) + 3) & (~3) + batchsize_scaling = 0.15 @log level = -2 """ Preparing to compute bases in batches.. The initial size of the batch is $batchsize. - The size increases in a geometric progression and is aligned by $(4). - The batch size multiplier is $batchsize_multiplier. - """ + The batch scale factor is $batchsize_scaling.""" # CRT and rational reconstruction settings indices_selection = Vector{Tuple{Int, Int}}(undef, length(state.gb_coeffs_zz)) @@ -524,9 +525,6 @@ function _groebner_learn_and_apply_threaded( end resize!(indices_selection, k - 1) unique!(indices_selection) - @log level = -2 "" length(indices_selection) length(state.gb_coeffs_zz) sum( - map(length, state.gb_coeffs_zz) - ) # Initialize partial CRT reconstruction if crt_algorithm === :incremental @@ -558,7 +556,6 @@ function _groebner_learn_and_apply_threaded( for i in 1:nthreads() empty!(threadbuf_gb_coeffs[i]) end - @log level = -1 "primes are" threadbuf_primes # NOTE: @threads with the option :static guarantees the persistence of # threadid() within a single loop iteration @@ -597,6 +594,7 @@ function _groebner_learn_and_apply_threaded( push!(threadbuf_gb_coeffs[threadid()], (threadlocal_prime_4x[3], gb_coeffs_3)) push!(threadbuf_gb_coeffs[threadid()], (threadlocal_prime_4x[4], gb_coeffs_4)) end + primes_used += batchsize threadbuf_gb_coeffs_union = reduce(vcat, threadbuf_gb_coeffs) @@ -611,15 +609,15 @@ function _groebner_learn_and_apply_threaded( partial_simultaneous_crt_reconstruct!(state, luckyprimes, indices_selection) end - @log level = -2 "Reconstructing coefficients to QQ" - @log level = -4 "Reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" + @log level = -2 "Partially reconstructing coefficients to QQ" + @log level = -4 "Partially reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" success_reconstruct = partial_rational_reconstruct!( state, luckyprimes, indices_selection, params.use_flint ) - @log level = -2 "Reconstruction successfull: $success_reconstruct" + @log level = -2 "Partial reconstruction successful: $success_reconstruct" @log level = -2 """ Used $(length(luckyprimes.primes)) primes in total over $(iters + 1) iterations. The current batch size is $batchsize. """ @@ -628,7 +626,7 @@ function _groebner_learn_and_apply_threaded( if !success_reconstruct @log level = -2 "Partial rational reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -638,7 +636,7 @@ function _groebner_learn_and_apply_threaded( if !success_check @log level = -2 "Heuristic check failed for partial reconstruction" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end end @@ -656,7 +654,7 @@ function _groebner_learn_and_apply_threaded( if !success_reconstruct @log level = -2 "Full reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -672,7 +670,7 @@ function _groebner_learn_and_apply_threaded( ) iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) end @log level = -2 "Correctness check passed!"
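The threaded path above seeds the batch size from the thread count instead of the old `4 * nthreads()`: twice the number of threads, clamped to the range [4, 32] and aligned up to a multiple of 4, which matches the groups of four primes handed out via `threadlocal_prime_4x`. A quick check of the expression (`initial_batchsize` is an illustrative name, not part of the patch):

initial_batchsize(nthr) = (min(32, max(4, 2 * nthr)) + 3) & (~3)

initial_batchsize(1)   # 4:  clamped up to the minimum
initial_batchsize(3)   # 8:  2 * 3 = 6, aligned up to a multiple of 4
initial_batchsize(8)   # 16: 2 * 8 is already a multiple of 4
initial_batchsize(30)  # 32: clamped at the cap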
@@ -771,14 +769,14 @@ function _groebner_classic_modular( # At this point, either the reconstruction or the correctness check failed. # Continue to compute Groebner bases modulo different primes in batches. + primes_used = 1 batchsize = 1 - batchsize_multiplier = 1.4 + batchsize_scaling = 0.1 @log level = -2 """ - Preparing to compute bases in batches.. - The initial size of the batch is $batchsize. - The size increases in a geometric progression. - The batch size multiplier is $batchsize_multiplier. - """ + Preparing to compute bases in batches.. + The initial size of the batch is $batchsize. + The batch scale factor is $batchsize_scaling.""" + if !tracer.ready_to_use @log level = -2 """ The tracer is disabled until the shape of the basis is determined via majority vote. @@ -805,9 +803,6 @@ function _groebner_classic_modular( end resize!(indices_selection, k - 1) unique!(indices_selection) - @log level = -2 "" length(indices_selection) length(state.gb_coeffs_zz) sum( - map(length, state.gb_coeffs_zz) - ) if crt_algorithm === :incremental partial_incremental_crt_reconstruct!(state, luckyprimes, indices_selection) @@ -841,20 +836,21 @@ function _groebner_classic_modular( if crt_algorithm === :incremental partial_incremental_crt_reconstruct!(state, luckyprimes, indices_selection) end + primes_used += 1 end if crt_algorithm === :simultaneous partial_simultaneous_crt_reconstruct!(state, luckyprimes, indices_selection) end - @log level = -2 "Reconstructing coefficients to QQ" - @log level = -4 "Reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" + @log level = -2 "Partially reconstructing coefficients to QQ" + @log level = -4 "Partially reconstructing coefficients from Z_$(luckyprimes.modulo * prime) to QQ" success_reconstruct = partial_rational_reconstruct!( state, luckyprimes, indices_selection, params.use_flint ) - @log level = -2 "Reconstruction successfull: $success_reconstruct" + @log level = -2 "Partial reconstruction successful: $success_reconstruct" @log level = -2 """ Used $(length(luckyprimes.primes)) primes in total over $(iters + 1) iterations. The current batch size is $batchsize. """ @@ -863,7 +859,7 @@ function _groebner_classic_modular( if !success_reconstruct @log level = -2 "Partial rational reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -873,7 +869,7 @@ function _groebner_classic_modular( if !success_check @log level = -2 "Heuristic check failed for partial reconstruction" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end end @@ -891,7 +887,7 @@ function _groebner_classic_modular( if !success_reconstruct @log level = -2 "Full reconstruction failed" iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) continue end @@ -907,7 +903,7 @@ function _groebner_classic_modular( ) iters += 1 - batchsize = get_next_batchsize(batchsize, batchsize_multiplier) + batchsize = get_next_batchsize(primes_used, batchsize, batchsize_scaling) end @log level = -2 "Correctness check passed!"
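The "partial reconstruction" steps above lift selected coefficients from Z_m (m being the accumulated CRT modulus) back to QQ. For reference, the textbook core of rational reconstruction on a single coefficient is a truncated extended Euclidean algorithm; this is a simplified illustration, not the package's `partial_rational_reconstruct!`, which batches the work over `indices_selection` and can delegate to FLINT (`params.use_flint`):

function rational_reconstruct(a::BigInt, m::BigInt)
    # recover n//d from a ≡ n * invmod(d, m) (mod m), with |n|, d <= sqrt(m/2)
    r0, r1 = m, mod(a, m)
    t0, t1 = big(0), big(1)
    bound = isqrt(m >> 1)
    while r1 >= bound
        q = div(r0, r1)
        r0, r1 = r1, r0 - q * r1
        t0, t1 = t1, t0 - q * t1
    end
    # fail if the candidate denominator is too large or not invertible mod m
    (abs(t1) <= bound && gcd(t1, m) == 1) || return (false, big(0) // big(1))
    (true, r1 // t1)
end

m = big(2)^31 - 1
a = mod(2 * invmod(big(3), m), m)  # a ≡ 2//3 (mod m)
rational_reconstruct(a, m)         # (true, 2//3)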
diff --git a/src/groebner/learn-apply.jl b/src/groebner/learn-apply.jl index e200727a..7c30e70d 100644 --- a/src/groebner/learn-apply.jl +++ b/src/groebner/learn-apply.jl @@ -170,9 +170,10 @@ end #= Several assumptions are in place: -- input contains no zero polynomials, +- input contains no zero polynomials (i.e., no empty vectors or vectors that + contain only zeros), - input coefficients are non-negative, -- input coefficients are smaller than modulo +- input coefficients are smaller than the modulo. =# function groebner_applyX!( wrapped_trace::WrappedTraceF4, @@ -185,7 +186,7 @@ function groebner_applyX!( logging_setup(kws) statistics_setup(kws) - trace = get_default_trace(wrapped_trace) + trace = get_trace!(wrapped_trace, modulo, kws) @log level = -5 "Selected trace" trace.representation.coefftype ring = extract_coeffs_raw_X!(trace, trace.representation, coeffs_zp, modulo, kws) @@ -206,5 +207,5 @@ function groebner_applyX!( dehomogenize_generators!(ring, gb_monoms, gb_coeffs, params) end - flag, gb_coeffs::Vector{Vector{UInt32}} + flag, gb_coeffs end diff --git a/src/input-output/AbstractAlgebra.jl b/src/input-output/AbstractAlgebra.jl index 6c7ced1b..cb8fd77e 100644 --- a/src/input-output/AbstractAlgebra.jl +++ b/src/input-output/AbstractAlgebra.jl @@ -308,6 +308,7 @@ function extract_coeffs_raw_X!( # a hack for homogenized inputs if trace.homogenize + @assert false @assert length(basis.monoms[length(polys) + 1]) == length(basis.coeffs[length(polys) + 1]) == 2 @@ -347,6 +348,9 @@ function _extract_coeffs_raw_X!( basis_cfs = basis.coeffs[i] poly_index = input_polys_perm[i] poly = coeffs_zp[poly_index] + if isempty(poly) + __throw_input_not_supported("Zero input polynomial", poly) + end if !(length(poly) == length(basis_cfs)) __throw_input_not_supported( "Potential coefficient cancellation in input polynomial at index $i on apply stage.", diff --git a/test/fglm/kbase.jl b/test/fglm/kbase.jl index 123bde41..6319377d 100644 --- a/test/fglm/kbase.jl +++ b/test/fglm/kbase.jl @@ -97,8 +97,8 @@ end gb = Groebner.groebner(noon, ordering=Groebner.DegRevLex()) @test length(Groebner.kbase(gb, ordering=Groebner.DegRevLex())) == 21 - noon = Groebner.noonn(7, k=GF(2^31 - 1)) - R = parent(first(noon)) - gb = Groebner.groebner(noon, ordering=Groebner.DegRevLex()) - @test length(Groebner.kbase(gb, ordering=Groebner.DegRevLex())) == 2173 + sys = Groebner.eco10(k=GF(2^31 - 1), ordering=:degrevlex) + R = parent(first(sys)) + gb = Groebner.groebner(sys, ordering=Groebner.DegRevLex()) + @test length(Groebner.kbase(gb, ordering=Groebner.DegRevLex())) == 256 end diff --git a/test/learn_and_apply/learn_and_apply_tmp.jl b/test/learn_and_apply/learn_and_apply_tmp.jl new file mode 100644 index 00000000..d766f77a --- /dev/null +++ b/test/learn_and_apply/learn_and_apply_tmp.jl @@ -0,0 +1,155 @@ +import Random + +params = (loglevel=0, sweep=true) + +@testset "learn & apply, same field" begin + K = AbstractAlgebra.GF(2^31 - 1) + R, (x, y) = polynomial_ring(K, ["x", "y"], ordering=:degrevlex) + trace, gb1 = Groebner.groebner_learn([x + 2, y + 3]) + flag, cfs = Groebner.groebner_applyX!( + trace, + [[UInt32(1), UInt32(3)], [UInt32(1), UInt32(2)]], + UInt32(2^27 - 39) + ) + @assert flag && cfs == [[UInt32(1), UInt32(2)], [UInt32(1), UInt32(3)]] + + cases = [ + (system=[x],), + (system=[R(1)],), + (system=[x, y],), + (system=[x, x],), + (system=[x, y, x * y],), + (system=[x + 1, y + 2, x * y + 3],), + (system=[x^20 * y + x + 1, x * y^20 + y + 1],), + (system=[x^20 * y + x + 1, x * y^20 + y + 1],), + 
(system=Groebner.noonn(3, ordering=:degrevlex, k=K),), + (system=Groebner.noonn(4, ordering=:degrevlex, k=K),), + (system=Groebner.noonn(5, ordering=:degrevlex, k=K),), + (system=Groebner.katsuran(3, ordering=:degrevlex, k=K),), + (system=Groebner.katsuran(4, ordering=:degrevlex, k=K),), + (system=Groebner.katsuran(5, ordering=:degrevlex, k=K),), + (system=Groebner.cyclicn(5, ordering=:degrevlex, k=K),), + (system=Groebner.rootn(5, ordering=:degrevlex, k=K),), + (system=Groebner.rootn(5, ordering=:degrevlex, k=K),), + (system=Groebner.rootn(6, ordering=:degrevlex, k=K),), + (system=Groebner.eco5(ordering=:degrevlex, k=K),), + (system=Groebner.eco10(ordering=:degrevlex, k=K),), + (system=Groebner.ku10(ordering=:degrevlex, k=K),), + (system=Groebner.kinema(ordering=:degrevlex, k=K),), + (system=Groebner.sparse5(ordering=:degrevlex, k=K),), + (system=Groebner.s9_1(ordering=:degrevlex, k=K),) + ] + + for case in cases + # Learn and apply on the same system + system = case.system + true_gb = Groebner.groebner(system; params...) + trace, gb_1 = Groebner.groebner_learn(system; params...) + cfs = map( + f -> map(c -> UInt32(data(c)), collect(AbstractAlgebra.coefficients(f))), + system + ) + flag, cfs_2 = Groebner.groebner_applyX!( + trace, + cfs, + UInt32(AbstractAlgebra.characteristic(K)); + params... + ) + @test flag && map(f -> collect(coefficients(f)), true_gb) == cfs_2 + end +end + +@testset "learn & apply, different field" begin + # Some small tests and corner cases + K, K2, K3 = GF(5), GF(7), GF(11) + R, (x, y) = polynomial_ring(K, ["x", "y"], ordering=:degrevlex) + R2, (x2, y2) = polynomial_ring(K2, ["x", "y"], ordering=:degrevlex) + R3, (x3, y3) = polynomial_ring(K3, ["x", "y"], ordering=:degrevlex) + system = [x^4 + 2y^3 + 3x^2 + 4y, 4y^4 + 3x^3 + 2y^2 + 1x] + system2 = map(f -> map_coefficients(c -> K2(data(c)), f), system) + + trace, gb_1 = Groebner.groebner_learn(system; params...) + cfs = map( + f -> map(c -> UInt32(data(c)), collect(AbstractAlgebra.coefficients(f))), + system2 + ) + flag, cfs_2 = Groebner.groebner_applyX!( + trace, + cfs, + UInt32(AbstractAlgebra.characteristic(K2)); + params... + ) + @test flag && map(f -> collect(coefficients(f)), Groebner.groebner(system2)) == cfs_2 + + # NOTE: these systems are only moderately large. + # We should also test for larger systems and larger primes!!
+ R, (x, y) = polynomial_ring(ZZ, ["x", "y"], ordering=:degrevlex) + cases = [ + (system=[x],), + (system=[R(1)],), + (system=[x, y],), + (system=[x, x],), + (system=[x, y, x * y],), + (system=[x + 1, y + 2, x * y + 3],), + (system=[x^20 * y + x + 1, x * y^20 + y + 1],), + (system=[x^20 * y + x + 1, x * y^20 + y + 1],), + (system=Groebner.noonn(3, ordering=:degrevlex, k=ZZ),), + (system=Groebner.noonn(4, ordering=:degrevlex, k=ZZ),), + (system=Groebner.noonn(5, ordering=:degrevlex, k=ZZ),), + (system=Groebner.katsuran(3, ordering=:degrevlex, k=ZZ),), + (system=Groebner.katsuran(4, ordering=:degrevlex, k=ZZ),), + (system=Groebner.katsuran(5, ordering=:degrevlex, k=ZZ),), + (system=Groebner.katsuran(6, ordering=:degrevlex, k=ZZ),), + (system=Groebner.katsuran(7, ordering=:degrevlex, k=ZZ),), + (system=Groebner.cyclicn(5, ordering=:degrevlex, k=ZZ),), + (system=Groebner.cyclicn(6, ordering=:degrevlex, k=ZZ),), + (system=Groebner.rootn(5, ordering=:degrevlex, k=ZZ),), + (system=Groebner.rootn(5, ordering=:degrevlex, k=ZZ),), + (system=Groebner.rootn(6, ordering=:degrevlex, k=ZZ),), + (system=Groebner.eco5(ordering=:degrevlex, k=ZZ),), + (system=Groebner.eco10(ordering=:degrevlex, k=ZZ),), + (system=Groebner.eco11(ordering=:degrevlex, k=ZZ),), + (system=Groebner.ku10(ordering=:degrevlex, k=ZZ),), + (system=Groebner.kinema(ordering=:degrevlex, k=ZZ),), + (system=Groebner.sparse5(ordering=:degrevlex, k=ZZ),), + (system=Groebner.s9_1(ordering=:degrevlex, k=ZZ),) + ] + + # Some bigger tests + K = AbstractAlgebra.GF(2^31 - 1) + Ks = [ + AbstractAlgebra.GF(2^31 - 1), + AbstractAlgebra.GF(2^30 + 3), + AbstractAlgebra.GF(2^31 + 11), + AbstractAlgebra.GF(2^27 - 39), + AbstractAlgebra.GF(2^20 + 7), + AbstractAlgebra.GF(2^20 + 13), + AbstractAlgebra.GF(2^20 + 25) + ] + for case in cases + # Learn and apply on the same system + system = case.system + system_zp = map(f -> map_coefficients(c -> K(BigInt(c)), f), system) + true_gb = Groebner.groebner(system_zp; params...) + trace, gb_1 = Groebner.groebner_learn(system_zp; params...) + + # Apply on the same system but modulo a different prime + for K_ in Ks + system_zp_2 = map(f -> map_coefficients(c -> K_(BigInt(c)), f), system) + cfs = map( + f -> + map(c -> UInt32(data(c)), collect(AbstractAlgebra.coefficients(f))), + system_zp_2 + ) + flag, cfs_2 = Groebner.groebner_applyX!( + trace, + cfs, + UInt32(AbstractAlgebra.characteristic(K_)); + params... + ) + @test flag && + map(f -> collect(coefficients(f)), Groebner.groebner(system_zp_2)) == + cfs_2 + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index c66b7995..55e455fd 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -56,7 +56,8 @@ end @time @includetests [ "learn_and_apply/learn_and_apply", - "learn_and_apply/apply_in_batches" + "learn_and_apply/apply_in_batches", + "learn_and_apply/learn_and_apply_tmp" ] @time @includetests ["isgroebner/isgroebner"]
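Taken together, the `groebner_applyX!` changes and the new test file pin down the cross-prime workflow: learn a trace once over one prime field, then replay the recorded computation on raw coefficient vectors modulo other primes. Condensed into a standalone script mirroring the smoke test at the top of `learn_and_apply_tmp.jl`:

using AbstractAlgebra, Groebner

K = GF(2^31 - 1)
R, (x, y) = polynomial_ring(K, ["x", "y"], ordering=:degrevlex)
trace, gb = Groebner.groebner_learn([x + 2, y + 3])

# Raw coefficients of the input system modulo the new prime 2^27 - 39,
# one UInt32 vector per polynomial; zero polynomials are not accepted.
coeffs_zp = [[UInt32(1), UInt32(3)], [UInt32(1), UInt32(2)]]
flag, gb_coeffs = Groebner.groebner_applyX!(trace, coeffs_zp, UInt32(2^27 - 39))
@assert flag && gb_coeffs == [[UInt32(1), UInt32(2)], [UInt32(1), UInt32(3)]]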