@@ -21,8 +21,8 @@ int jl_n_sweepthreads;
 _Atomic(int) gc_n_threads_marking;
 // Number of threads sweeping
 _Atomic(int) gc_n_threads_sweeping;
-// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping
-_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch;
+// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
+_Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
 // `tid` of mutator thread that triggered GC
 _Atomic(int) gc_master_tid;
 // `tid` of first GC thread
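Note: `jl_gc_padded_page_stack_t` itself is not shown in this diff. A minimal sketch of what such a padded wrapper plausibly looks like (the exact definition and padding constant live in `gc.h` and may differ):

    // Hypothetical sketch: pad each per-thread page stack out to its own
    // cache line(s) so sweeper threads pushing to neighboring entries of
    // the gc_allocd_scratch array do not false-share.
    typedef union {
        jl_gc_page_stack_t stack;
        char _pad[128]; // assumed padding size; the actual constant may differ
    } jl_gc_padded_page_stack_t;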
@@ -1586,8 +1586,72 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
     pg->nfree = nfree;
 }
 
-void gc_sweep_wake_all(void)
+// pre-scan pages to check whether there are enough of them to make parallel sweeping worthwhile;
+// also sweeps pages that don't need to be linearly scanned
+int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
 {
+    // 4MB worth of pages is worth parallelizing
+    const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ);
+    int n_pages_to_scan = 0;
+    gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
+    for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        jl_gc_page_stack_t *dest = &new_gc_allocd_scratch[ptls2->tid].stack;
+        jl_gc_page_stack_t tmp;
+        jl_gc_pagemeta_t *tail = NULL;
+        memset(&tmp, 0, sizeof(tmp));
+        while (1) {
+            jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->page_metadata_allocd);
+            if (pg == NULL) {
+                break;
+            }
+            int should_scan = 1;
+            if (!pg->has_marked) {
+                should_scan = 0;
+            }
+            if (!current_sweep_full && !pg->has_young) {
+                assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
+                if (!prev_sweep_full || pg->prev_nold == pg->nold) {
+                    should_scan = 0;
+                }
+            }
+            if (should_scan) {
+                if (tail == NULL) {
+                    tail = pg;
+                }
+                n_pages_to_scan++;
+                push_lf_back_nosync(&tmp, pg);
+            }
+            else {
+                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
+            }
+            if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+                break;
+            }
+        }
+        if (tail != NULL) {
+            tail->next = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+        }
+        ptls2->page_metadata_allocd = tmp;
+        if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+            break;
+        }
+    }
+    gc_page_serializer_destroy(&serializer);
+    return n_pages_to_scan >= n_pages_worth_parallel_sweep;
+}
+
+// wake up all threads to sweep the pages
+void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
+{
+    int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch);
+    jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch);
+    if (!parallel_sweep_worthwhile) {
+        return;
+    }
     uv_mutex_lock(&gc_threads_lock);
     for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[i];
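Note: as a worked example of the threshold above, assuming the common 16 KiB value of `GC_PAGE_SZ`, `n_pages_worth_parallel_sweep` comes out to 4 * 2^20 / 2^14 = 256 pages. The `_nosync` stack helpers are safe here because the prescan runs before the sweeper threads are woken, so only one thread touches these stacks; a plausible sketch of the helpers (the actual definitions in `gc.h` may differ):

    // Hypothetical sketch: unsynchronized push/pop on the page-metadata
    // stack, treating `bottom` as the head of a singly linked list of
    // jl_gc_pagemeta_t.
    STATIC_INLINE void push_lf_back_nosync(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt)
    {
        elt->next = jl_atomic_load_relaxed(&pool->bottom);
        jl_atomic_store_relaxed(&pool->bottom, elt);
    }
    STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back_nosync(jl_gc_page_stack_t *pool)
    {
        jl_gc_pagemeta_t *old = jl_atomic_load_relaxed(&pool->bottom);
        if (old != NULL)
            jl_atomic_store_relaxed(&pool->bottom, old->next);
        return old;
    }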
@@ -1597,6 +1661,7 @@ void gc_sweep_wake_all(void)
     uv_mutex_unlock(&gc_threads_lock);
 }
 
+// wait for all threads to finish sweeping
 void gc_sweep_wait_for_all(void)
 {
     jl_atomic_store(&gc_allocd_scratch, NULL);
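Note: the body of the wait loop falls outside this hunk. Judging from the closing braces at the top of the next hunk, it presumably spins until `gc_n_threads_sweeping` drops to zero, along the lines of this hypothetical sketch:

    // Hypothetical sketch: clearing gc_allocd_scratch above makes
    // late-arriving sweepers bail out immediately; then spin until every
    // active sweeper has decremented gc_n_threads_sweeping.
    while (jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) {
        jl_cpu_pause();
    }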
@@ -1605,36 +1670,58 @@ void gc_sweep_wait_for_all(void)
     }
 }
 
-void gc_sweep_pool_parallel(void)
+// sweep all pools
+void gc_sweep_pool_parallel(jl_ptls_t ptls)
 {
     jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
-    jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
+    jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
     if (allocd_scratch != NULL) {
         gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
         while (1) {
             int found_pg = 0;
+            // sequentially walk the threads and sweep the pages
             for (int t_i = 0; t_i < gc_n_threads; t_i++) {
                 jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                // skip foreign threads that already exited
                 if (ptls2 == NULL) {
                     continue;
                 }
-                jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
-                jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
+                jl_gc_page_stack_t *dest = &allocd_scratch[ptls2->tid].stack;
+                jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
+                // failed steal attempt
                 if (pg == NULL) {
                     continue;
                 }
-                gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg);
+                gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
                 found_pg = 1;
             }
             if (!found_pg) {
-                break;
+                // check for termination
+                int no_more_work = 1;
+                for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+                    jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                    // skip foreign threads that already exited
+                    if (ptls2 == NULL) {
+                        continue;
+                    }
+                    jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+                    if (pg != NULL) {
+                        no_more_work = 0;
+                        break;
+                    }
+                }
+                if (no_more_work) {
+                    break;
+                }
             }
+            jl_cpu_pause();
         }
         gc_page_serializer_destroy(&serializer);
     }
     jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
 }
 
+// free all pages (i.e. through `madvise` on Linux) that were lazily freed
 void gc_free_pages(void)
 {
     while (1) {
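Note: the change from `pop_lf_back` to `try_pop_lf_back` is why a NULL result above is treated as a failed steal attempt rather than proof the stack is empty, and why the explicit termination scan is needed. A plausible sketch of the `try_` variant (the real helper may differ):

    // Hypothetical sketch: a single CAS attempt on the stack head; on
    // contention, return NULL so the sweeper moves on to the next
    // thread's stack instead of retrying.
    STATIC_INLINE jl_gc_pagemeta_t *try_pop_lf_back(jl_gc_page_stack_t *pool)
    {
        jl_gc_pagemeta_t *old = jl_atomic_load_relaxed(&pool->bottom);
        if (old == NULL)
            return NULL;
        if (jl_atomic_cmpswap(&pool->bottom, &old, old->next))
            return old;
        return NULL; // lost the race: report a failed steal
    }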
@@ -1659,7 +1746,7 @@ static void gc_sweep_pool(void)
 
     // allocate enough space to hold the end of the free list chain
     // for every thread and pool size
-    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t **));
+    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t **));
 
     // update metadata of pages that were pointed to by freelist or newpages from a pool
     // i.e. pages being the current allocation target
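Note: the `alloca` being replaced put `n_threads * JL_GC_N_POOLS` pointers on the sweeping thread's stack, which grows with the thread count; `malloc_s` moves this scratch space to the heap (and it is freed at the end of `gc_sweep_pool`, per the last hunk). Assuming `malloc_s` is a checked malloc, a minimal sketch:

    // Hypothetical sketch: malloc that treats allocation failure as fatal,
    // since the sweep cannot proceed without its scratch buffers.
    static void *malloc_s(size_t sz)
    {
        void *p = malloc(sz);
        if (p == NULL)
            abort();
        return p;
    }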
@@ -1701,17 +1788,18 @@ static void gc_sweep_pool(void)
     }
 
     // the actual sweeping
-    jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
-    memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
-    jl_atomic_store(&gc_allocd_scratch, tmp);
-    gc_sweep_wake_all();
-    gc_sweep_pool_parallel();
+    jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) malloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
+    memset(new_gc_allocd_scratch, 0, n_threads * sizeof(jl_gc_padded_page_stack_t));
+    jl_ptls_t ptls = jl_current_task->ptls;
+    gc_sweep_wake_all(ptls, new_gc_allocd_scratch);
+    gc_sweep_pool_parallel(ptls);
     gc_sweep_wait_for_all();
 
+    // reset half-pages pointers
     for (int t_i = 0; t_i < n_threads; t_i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[t_i];
         if (ptls2 != NULL) {
-            ptls2->page_metadata_allocd = tmp[t_i];
+            ptls2->page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
             for (int i = 0; i < JL_GC_N_POOLS; i++) {
                 jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
                 p->newpages = NULL;
@@ -1749,6 +1837,10 @@ static void gc_sweep_pool(void)
         }
     }
 
+    // cleanup
+    free(pfl);
+    free(new_gc_allocd_scratch);
+
 #ifdef _P64 // only enable concurrent sweeping on 64bit
     // wake thread up to sweep concurrently
     if (jl_n_sweepthreads > 0) {