@@ -21,8 +21,8 @@ int jl_n_sweepthreads;
 _Atomic(int) gc_n_threads_marking;
 // Number of threads sweeping
 _Atomic(int) gc_n_threads_sweeping;
-// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping
-_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch;
+// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
+_Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
 // `tid` of mutator thread that triggered GC
 _Atomic(int) gc_master_tid;
 // `tid` of first GC thread
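The new `jl_gc_padded_page_stack_t` type is defined elsewhere in the GC headers; a minimal sketch of the idea, assuming `jl_gc_page_stack_t` from gc.h is in scope and a 128-byte padding target (the real definition and constant may differ):

typedef union {
    jl_gc_page_stack_t stack;
    // round each per-thread slot up to (assumed) cache-line granularity so
    // that sweeper threads pushing into neighboring slots of
    // `gc_allocd_scratch` do not invalidate each other's cache lines
    char _pad[128];
} jl_gc_padded_page_stack_t;

A union (rather than a trailing pad array) keeps the definition valid even if `sizeof(jl_gc_page_stack_t)` grows close to the padding target.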
@@ -1593,8 +1593,72 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
     pg->nfree = nfree;
 }

-void gc_sweep_wake_all(void)
+// pre-scan pages to check whether there are enough pages to make parallel sweeping worthwhile
+// also sweeps pages that don't need to be linearly scanned
+int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
 {
+    // 4MB worth of pages is worth parallelizing
+    const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ);
+    int n_pages_to_scan = 0;
+    gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
+    for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        jl_gc_page_stack_t *dest = &new_gc_allocd_scratch[ptls2->tid].stack;
+        jl_gc_page_stack_t tmp;
+        jl_gc_pagemeta_t *tail = NULL;
+        memset(&tmp, 0, sizeof(tmp));
+        while (1) {
+            jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->page_metadata_allocd);
+            if (pg == NULL) {
+                break;
+            }
+            int should_scan = 1;
+            if (!pg->has_marked) {
+                should_scan = 0;
+            }
+            if (!current_sweep_full && !pg->has_young) {
+                assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
+                if (!prev_sweep_full || pg->prev_nold == pg->nold) {
+                    should_scan = 0;
+                }
+            }
+            if (should_scan) {
+                if (tail == NULL) {
+                    tail = pg;
+                }
+                n_pages_to_scan++;
+                push_lf_back_nosync(&tmp, pg);
+            }
+            else {
+                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
+            }
+            if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+                break;
+            }
+        }
+        if (tail != NULL) {
+            tail->next = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+        }
+        ptls2->page_metadata_allocd = tmp;
+        if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+            break;
+        }
+    }
+    gc_page_serializer_destroy(&serializer);
+    return n_pages_to_scan >= n_pages_worth_parallel_sweep;
+}
+
+// wake up all threads to sweep the pages
+void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
+{
+    int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch);
+    jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch);
+    if (!parallel_sweep_worthwhile) {
+        return;
+    }
     uv_mutex_lock(&gc_threads_lock);
     for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[i];
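Two details of the prescan are worth noting: pages that do need a linear scan are pushed onto a temporary stack and then spliced back ahead of whatever remains of the thread's `page_metadata_allocd` list (the `tail->next = ...bottom` step), and the 4 MB cutoff is simply a page count. A quick standalone check of that count, assuming the usual 16 KiB `GC_PAGE_SZ` (an assumption; the constant may differ on some builds):

#include <assert.h>

int main(void)
{
    const int GC_PAGE_SZ = 16 * 1024;                      // assumed page size
    const int n_pages = (int)(4 * (1 << 20) / GC_PAGE_SZ); // expression from the diff
    assert(n_pages == 256); // parallel sweeping engages at >= 256 scannable pages
    return 0;
}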
@@ -1604,6 +1668,7 @@ void gc_sweep_wake_all(void)
     uv_mutex_unlock(&gc_threads_lock);
 }

+// wait for all threads to finish sweeping
 void gc_sweep_wait_for_all(void)
 {
     jl_atomic_store(&gc_allocd_scratch, NULL);
@@ -1612,36 +1677,58 @@ void gc_sweep_wait_for_all(void)
     }
 }

-void gc_sweep_pool_parallel(void)
+// sweep all pools
+void gc_sweep_pool_parallel(jl_ptls_t ptls)
 {
     jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
-    jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
+    jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
     if (allocd_scratch != NULL) {
         gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
         while (1) {
             int found_pg = 0;
+            // sequentially walk the threads and sweep the pages
             for (int t_i = 0; t_i < gc_n_threads; t_i++) {
                 jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                // skip foreign threads that already exited
                 if (ptls2 == NULL) {
                     continue;
                 }
-                jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
-                jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
+                jl_gc_page_stack_t *dest = &allocd_scratch[ptls2->tid].stack;
+                jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
+                // failed steal attempt
                 if (pg == NULL) {
                     continue;
                 }
-                gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg);
+                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
                 found_pg = 1;
             }
             if (!found_pg) {
-                break;
+                // check for termination
+                int no_more_work = 1;
+                for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+                    jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                    // skip foreign threads that already exited
+                    if (ptls2 == NULL) {
+                        continue;
+                    }
+                    jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+                    if (pg != NULL) {
+                        no_more_work = 0;
+                        break;
+                    }
+                }
+                if (no_more_work) {
+                    break;
+                }
             }
+            jl_cpu_pause();
         }
         gc_page_serializer_destroy(&serializer);
     }
     jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
 }

+// free all pages (e.g. through `madvise` on Linux) that were lazily freed
 void gc_free_pages(void)
 {
     while (1) {
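The switch from `pop_lf_back` to `try_pop_lf_back` above is what forces the explicit termination scan: a try-pop may return NULL because it lost a race with another sweeper, not because the stack is empty, so an empty-handed round is only conclusive after re-checking every thread's `bottom` pointer (with `jl_cpu_pause()` easing contention between rounds). A minimal sketch of that distinction with C11 atomics; `lf_stack_t`, `node_t`, and `try_pop` are invented names for illustration, not the Julia implementation (which must also contend with ABA):

#include <stdatomic.h>
#include <stddef.h>

typedef struct node {
    struct node *next;
} node_t;

typedef struct {
    _Atomic(node_t *) bottom;
} lf_stack_t;

// Single CAS attempt: NULL can mean "empty" *or* "lost a race", so a
// caller cannot treat NULL alone as proof that no work remains.
static node_t *try_pop(lf_stack_t *s)
{
    node_t *top = atomic_load(&s->bottom);
    if (top == NULL)
        return NULL; // genuinely empty right now
    if (atomic_compare_exchange_weak(&s->bottom, &top, top->next))
        return top;  // won the race: this node is ours
    return NULL;     // contention: caller should retry or re-scan
}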
@@ -1666,7 +1753,7 @@ static void gc_sweep_pool(void)

     // allocate enough space to hold the end of the free list chain
     // for every thread and pool size
-    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t **));
+    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t **));

     // update metadata of pages that were pointed to by freelist or newpages from a pool
     // i.e. pages being the current allocation target
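The `alloca` to `malloc_s` change matters because these buffers scale with `n_threads * JL_GC_N_POOLS`: with many threads, a stack allocation of that size risks overflow, and the heap buffers are released in the cleanup hunk further down. `malloc_s` behaves as a checked malloc that never returns NULL; a sketch of that contract under an invented name (`xmalloc`), since the actual `malloc_s` definition lives elsewhere in the Julia sources:

#include <stdio.h>
#include <stdlib.h>

// Checked allocation in the spirit of `malloc_s`: abort instead of
// returning NULL, so callers never have to test the result.
static void *xmalloc(size_t sz)
{
    void *p = malloc(sz);
    if (p == NULL) {
        fprintf(stderr, "fatal: out of memory allocating %zu bytes\n", sz);
        abort();
    }
    return p;
}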
@@ -1708,17 +1795,18 @@ static void gc_sweep_pool(void)
     }

     // the actual sweeping
-    jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
-    memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
-    jl_atomic_store(&gc_allocd_scratch, tmp);
-    gc_sweep_wake_all();
-    gc_sweep_pool_parallel();
+    jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) malloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
+    memset(new_gc_allocd_scratch, 0, n_threads * sizeof(jl_gc_padded_page_stack_t));
+    jl_ptls_t ptls = jl_current_task->ptls;
+    gc_sweep_wake_all(ptls, new_gc_allocd_scratch);
+    gc_sweep_pool_parallel(ptls);
     gc_sweep_wait_for_all();

+    // reset half-pages pointers
     for (int t_i = 0; t_i < n_threads; t_i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[t_i];
         if (ptls2 != NULL) {
-            ptls2->page_metadata_allocd = tmp[t_i];
+            ptls2->page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
             for (int i = 0; i < JL_GC_N_POOLS; i++) {
                 jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
                 p->newpages = NULL;
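Taken together, the master thread's protocol is: publish the scratch buffer, wake the helpers, join the sweep itself, then unpublish and wait for stragglers (`gc_sweep_wait_for_all` stores NULL into `gc_allocd_scratch` before waiting, presumably until `gc_n_threads_sweeping` drops to zero). A compressed sketch of that shape with invented names; the real wakeup goes through `gc_threads_lock` and a condition variable:

#include <stdatomic.h>
#include <stddef.h>

static _Atomic(void *) scratch;  // stands in for gc_allocd_scratch
static _Atomic(int) n_sweeping;  // stands in for gc_n_threads_sweeping

static void wake_helpers(void) { /* broadcast on a condition variable */ }
static void help_sweep(void)   { /* drain page stacks, as gc_sweep_pool_parallel does */ }

static void orchestrate_sweep(void *buf)
{
    atomic_store(&scratch, buf); // helpers pick up work only while this is set
    wake_helpers();
    help_sweep();                // the master thread sweeps too
    atomic_store(&scratch, NULL);
    while (atomic_load(&n_sweeping) != 0) {
        // spin: helpers decrement n_sweeping on their way out
    }
}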
@@ -1756,6 +1844,10 @@ static void gc_sweep_pool(void)
         }
     }

+    // cleanup
+    free(pfl);
+    free(new_gc_allocd_scratch);
+
 #ifdef _P64 // only enable concurrent sweeping on 64bit
     // wake thread up to sweep concurrently
     if (jl_n_sweepthreads > 0) {