Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mark phase prefetching. #73375

Merged
merged 5 commits into from
Aug 9, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
222 changes: 192 additions & 30 deletions src/coreclr/gc/gc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2788,6 +2788,8 @@ size_t gc_heap::expand_mechanisms_per_heap[max_expand_mechanisms_count];

size_t gc_heap::interesting_mechanism_bits_per_heap[max_gc_mechanism_bits_count];

mark_queue_t gc_heap::mark_queue;

#endif // MULTIPLE_HEAPS

/* end of per heap static initialization */
Expand Down Expand Up @@ -23203,24 +23205,19 @@ inline
BOOL gc_heap::gc_mark (uint8_t* o, uint8_t* low, uint8_t* high, int condemned_gen)
{
#ifdef USE_REGIONS
assert (low == 0);
assert (high == 0);
if (is_in_heap_range (o))
if ((o >= low) && (o < high))
{
BOOL already_marked = marked (o);
if (already_marked)
return FALSE;
if (condemned_gen == max_generation)
if (condemned_gen != max_generation && get_region_gen_num (o) > condemned_gen)
{
set_marked (o);
return TRUE;
return FALSE;
}
int gen = get_region_gen_num (o);
if (gen <= condemned_gen)
BOOL already_marked = marked (o);
if (already_marked)
{
set_marked (o);
return TRUE;
return FALSE;
}
set_marked (o);
return TRUE;
}
return FALSE;
#else //USE_REGIONS
Expand Down Expand Up @@ -23543,14 +23540,21 @@ void gc_heap::save_post_plug_info (uint8_t* last_pinned_plug, uint8_t* last_obje
}
}

//#define PREFETCH
#define PREFETCH
#ifdef PREFETCH
__declspec(naked) void __fastcall Prefetch(void* addr)
// Hint the hardware to pull the cache line containing addr toward L1,
// so that by the time we actually dereference the address the data is
// (hopefully) already in cache. Falls back to a no-op on targets with
// no known prefetch intrinsic.
inline void Prefetch(void* addr)
{
#ifdef TARGET_AMD64

#ifndef _MM_HINT_T0
#define _MM_HINT_T0 1
#endif
    _mm_prefetch((const char*)addr, _MM_HINT_T0);
#elif defined(TARGET_ARM64) && defined(TARGET_WINDOWS)
    __prefetch((const char*)addr);
#elif defined(TARGET_ARM64) && defined(__GNUC__)
    // non-Windows ARM64: use the GCC/Clang intrinsic; the default
    // arguments (read access, high temporal locality) are fine for
    // our purposes here
    __builtin_prefetch(addr);
#else
    UNREFERENCED_PARAMETER(addr);
#endif
}
#else //PREFETCH
inline void Prefetch (void* addr)
Expand Down Expand Up @@ -23606,6 +23610,114 @@ BOOL ref_p (uint8_t* r)
return (straight_ref_p (r) || partial_object_p (r));
}

// Construct an empty mark queue: every slot vacant, cursor at slot 0.
mark_queue_t::mark_queue_t() : curr_slot_index(0)
{
    for (uint8_t*& slot : slot_table)
    {
        slot = nullptr;
    }
}

// place an object in the mark queue
// returns a *different* object or nullptr
// if a non-null object is returned, that object is newly marked
// object o *must* be in a condemned generation
// place an object in the mark queue
// returns a *different* object or nullptr
// if a non-null object is returned, that object is newly marked
// object o *must* be in a condemned generation
FORCEINLINE
uint8_t *mark_queue_t::queue_mark(uint8_t *o)
{
    // start pulling o's cache line in now...
    Prefetch (o);

    // ...and, while that prefetch is in flight, park o in the queue and
    // pull out the object that has been sitting in this slot the longest -
    // its memory is (hopefully) in cache by now
    size_t idx = curr_slot_index;
    uint8_t* parked = slot_table[idx];
    slot_table[idx] = o;
    curr_slot_index = (idx + 1) % slot_count;

    // marked()/set_marked() touch the old object's method table pointer,
    // which is the access the earlier prefetch was meant to cover
    if ((parked != nullptr) && !marked (parked))
    {
        set_marked (parked);
        return parked;
    }
    return nullptr;
}

// place an object in the mark queue
// returns a *different* object or nullptr
// if a non-null object is returned, that object is newly marked
// check first whether the object o is indeed in a condemned generation
// place an object in the mark queue
// returns a *different* object or nullptr
// if a non-null object is returned, that object is newly marked
// check first whether the object o is indeed in a condemned generation
FORCEINLINE
uint8_t *mark_queue_t::queue_mark(uint8_t *o, int condemned_gen)
{
#ifdef USE_REGIONS
    // ignore pointers that don't point into the GC heap at all
    if (!is_in_heap_range (o))
    {
        return nullptr;
    }
    // in an ephemeral GC, skip objects whose region belongs to a
    // generation older than the condemned one - they are not being
    // collected this cycle
    if (condemned_gen != max_generation && gc_heap::get_region_gen_num (o) > condemned_gen)
    {
        return nullptr;
    }
    return queue_mark(o);
#else //USE_REGIONS
    // segments build: callers pass -1 because the condemned set is
    // described by the [gc_low, gc_high) address range rather than a
    // generation number
    assert (condemned_gen == -1);

#ifdef MULTIPLE_HEAPS
    if (o)
    {
        // locate the heap the object lives on and queue it only if it
        // falls inside that heap's condemned address range
        gc_heap* hp = gc_heap::heap_of_gc (o);
        assert (hp);
        if ((o >= hp->gc_low) && (o < hp->gc_high))
            return queue_mark (o);
    }
#else //MULTIPLE_HEAPS
    if ((o >= gc_heap::gc_low) && (o < gc_heap::gc_high))
        return queue_mark (o);
#endif //MULTIPLE_HEAPS
    return nullptr;
#endif //USE_REGIONS
}

// retrieve a newly marked object from the queue
// returns nullptr if there is no such object
// retrieve a newly marked object from the queue
// returns nullptr if there is no such object
uint8_t* mark_queue_t::drain()
{
    size_t idx = curr_slot_index;
    // sweep over every slot at most once, emptying slots as we go
    for (size_t visited = 0; visited < slot_count; visited++)
    {
        uint8_t* parked = slot_table[idx];
        slot_table[idx] = nullptr;
        idx = (idx + 1) % slot_count;
        if ((parked != nullptr) && !marked (parked))
        {
            // found an object that still needs marking - mark it,
            // remember where to resume, and hand it back
            set_marked (parked);
            curr_slot_index = idx;
            return parked;
        }
    }
    // every slot was empty or held an already-marked object;
    // the queue is now fully drained
    return nullptr;
}

// The queue must have been fully drained before it is destroyed -
// every slot is expected to be empty at this point.
mark_queue_t::~mark_queue_t()
{
    size_t i = slot_count;
    while (i-- > 0)
    {
        assert(slot_table[i] == nullptr);
    }
}

void gc_heap::mark_object_simple1 (uint8_t* oo, uint8_t* start THREAD_NUMBER_DCL)
{
SERVER_SC_MARK_VOLATILE(uint8_t*)* mark_stack_tos = (SERVER_SC_MARK_VOLATILE(uint8_t*)*)mark_stack_array;
Expand Down Expand Up @@ -23665,9 +23777,8 @@ void gc_heap::mark_object_simple1 (uint8_t* oo, uint8_t* start THREAD_NUMBER_DCL

go_through_object_cl (method_table(oo), oo, s, ppslot,
{
uint8_t* o = *ppslot;
Prefetch(o);
if (gc_mark (o, gc_low, gc_high, condemned_gen))
uint8_t* o = mark_queue.queue_mark(*ppslot, condemned_gen);
if (o != nullptr)
{
if (full_p)
{
Expand Down Expand Up @@ -23763,9 +23874,8 @@ void gc_heap::mark_object_simple1 (uint8_t* oo, uint8_t* start THREAD_NUMBER_DCL
go_through_object (method_table(oo), oo, s, ppslot,
start, use_start, (oo + s),
{
uint8_t* o = *ppslot;
Prefetch(o);
if (gc_mark (o, gc_low, gc_high,condemned_gen))
uint8_t* o = mark_queue.queue_mark(*ppslot, condemned_gen);
if (o != nullptr)
{
if (full_p)
{
Expand Down Expand Up @@ -24204,16 +24314,17 @@ gc_heap::mark_object_simple (uint8_t** po THREAD_NUMBER_DCL)
snoop_stat.objects_checked_count++;
#endif //SNOOP_STATS

if (gc_mark1 (o))
o = mark_queue.queue_mark (o);
if (o != nullptr)
{
m_boundary (o);
size_t s = size (o);
add_to_promoted_bytes (o, s, thread);
{
go_through_object_cl (method_table(o), o, s, poo,
{
uint8_t* oo = *poo;
if (gc_mark (oo, gc_low, gc_high, condemned_gen))
uint8_t* oo = mark_queue.queue_mark(*poo, condemned_gen);
if (oo != nullptr)
{
m_boundary (oo);
add_to_promoted_bytes (oo, thread);
Expand Down Expand Up @@ -24250,6 +24361,45 @@ void gc_heap::mark_object (uint8_t* o THREAD_NUMBER_DCL)
#endif //USE_REGIONS
}

// Empty the mark queue, doing the marking work for every object still
// parked in it: account its promoted bytes and trace through the objects
// it references. Called after each root-scanning stage so that promoted
// byte counts (and the events fired from them) cover everything queued
// during that stage.
void gc_heap::drain_mark_queue ()
{
    // with regions, queue_mark filters by generation number; with
    // segments it filters by address range and expects -1 here
    int condemned_gen =
#ifdef USE_REGIONS
        settings.condemned_generation;
#else
        -1;
#endif //USE_REGIONS

#ifdef MULTIPLE_HEAPS
    THREAD_FROM_HEAP;
#else
    const int thread = 0;
#endif //MULTIPLE_HEAPS

    uint8_t* o;
    // drain() returns one newly marked object at a time until the
    // queue is empty
    while ((o = mark_queue.drain()) != nullptr)
    {
        m_boundary (o);
        size_t s = size (o);
        add_to_promoted_bytes (o, s, thread);
        if (contain_pointers_or_collectible (o))
        {
            // visit every reference o holds; newly marked referents are
            // accounted here and traced recursively via mark_object_simple1
            // (which itself feeds the mark queue)
            go_through_object_cl (method_table(o), o, s, poo,
            {
                uint8_t* oo = mark_queue.queue_mark(*poo, condemned_gen);
                if (oo != nullptr)
                {
                    m_boundary (oo);
                    add_to_promoted_bytes (oo, thread);
                    if (contain_pointers_or_collectible (oo))
                        mark_object_simple1 (oo, oo THREAD_NUMBER_ARG);
                }
            }
            );
        }
    }
}

#ifdef BACKGROUND_GC

#ifdef USE_REGIONS
Expand Down Expand Up @@ -25426,6 +25576,8 @@ void gc_heap::scan_dependent_handles (int condemned_gen_number, ScanContext *sc,
if (GCScan::GcDhUnpromotedHandlesExist(sc))
s_fUnpromotedHandles = TRUE;

drain_mark_queue();

// Synchronize all the threads so we can read our state variables safely. The shared variable
// s_fScanRequired, indicating whether we should scan the tables or terminate the loop, will be set by
// a single thread inside the join.
Expand Down Expand Up @@ -25844,6 +25996,7 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p)
if ((condemned_gen_number == max_generation) && (num_sizedrefs > 0))
{
GCScan::GcScanSizedRefs(GCHeap::Promote, condemned_gen_number, max_generation, &sc);
drain_mark_queue();
fire_mark_event (ETW::GC_ROOT_SIZEDREF, current_promoted_bytes, last_promoted_bytes);

#ifdef MULTIPLE_HEAPS
Expand All @@ -25867,26 +26020,30 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p)
GCScan::GcScanRoots(GCHeap::Promote,
condemned_gen_number, max_generation,
&sc);
drain_mark_queue();
fire_mark_event (ETW::GC_ROOT_STACK, current_promoted_bytes, last_promoted_bytes);

#ifdef BACKGROUND_GC
if (gc_heap::background_running_p())
{
scan_background_roots (GCHeap::Promote, heap_number, &sc);
drain_mark_queue();
fire_mark_event (ETW::GC_ROOT_BGC, current_promoted_bytes, last_promoted_bytes);
}
#endif //BACKGROUND_GC

#ifdef FEATURE_PREMORTEM_FINALIZATION
dprintf(3, ("Marking finalization data"));
finalize_queue->GcScanRoots(GCHeap::Promote, heap_number, 0);
drain_mark_queue();
fire_mark_event (ETW::GC_ROOT_FQ, current_promoted_bytes, last_promoted_bytes);
#endif // FEATURE_PREMORTEM_FINALIZATION

dprintf(3,("Marking handle table"));
GCScan::GcScanHandles(GCHeap::Promote,
condemned_gen_number, max_generation,
&sc);
drain_mark_queue();
fire_mark_event (ETW::GC_ROOT_HANDLES, current_promoted_bytes, last_promoted_bytes);

if (!full_p)
Expand Down Expand Up @@ -25998,6 +26155,7 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p)
update_old_card_survived();
#endif //USE_REGIONS

drain_mark_queue();
fire_mark_event (ETW::GC_ROOT_OLDER, current_promoted_bytes, last_promoted_bytes);
}
}
Expand All @@ -26006,6 +26164,7 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p)
if (do_mark_steal_p)
{
mark_steal();
drain_mark_queue();
fire_mark_event (ETW::GC_ROOT_STEAL, current_promoted_bytes, last_promoted_bytes);
}
#endif //MH_SC_MARK
Expand All @@ -26019,6 +26178,7 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p)
// handle table has been fully promoted.
GCScan::GcDhInitialScan(GCHeap::Promote, condemned_gen_number, max_generation, &sc);
scan_dependent_handles(condemned_gen_number, &sc, true);
drain_mark_queue();
fire_mark_event (ETW::GC_ROOT_DH_HANDLES, current_promoted_bytes, last_promoted_bytes);

#ifdef MULTIPLE_HEAPS
Expand Down Expand Up @@ -26101,12 +26261,14 @@ void gc_heap::mark_phase (int condemned_gen_number, BOOL mark_only_p)
#ifdef FEATURE_PREMORTEM_FINALIZATION
dprintf (3, ("Finalize marking"));
finalize_queue->ScanForFinalization (GCHeap::Promote, condemned_gen_number, mark_only_p, __this);
drain_mark_queue();
fire_mark_event (ETW::GC_ROOT_NEW_FQ, current_promoted_bytes, last_promoted_bytes);
GCToEEInterface::DiagWalkFReachableObjects(__this);

// Scan dependent handles again to promote any secondaries associated with primaries that were promoted
// for finalization. As before scan_dependent_handles will also process any mark stack overflow.
scan_dependent_handles(condemned_gen_number, &sc, false);
drain_mark_queue();
fire_mark_event (ETW::GC_ROOT_DH_HANDLES, current_promoted_bytes, last_promoted_bytes);
#endif //FEATURE_PREMORTEM_FINALIZATION

Expand Down Expand Up @@ -31575,7 +31737,7 @@ uint8_t* tree_search (uint8_t* tree, uint8_t* old_address)
assert (candidate < tree);
candidate = tree;
tree = tree + cn;
Prefetch (tree - 8);
Prefetch (&((plug_and_pair*)tree)[-1].m_pair.left);
continue;
}
else
Expand All @@ -31586,7 +31748,7 @@ uint8_t* tree_search (uint8_t* tree, uint8_t* old_address)
if ((cn = node_left_child (tree)) != 0)
{
tree = tree + cn;
Prefetch (tree - 8);
Prefetch (&((plug_and_pair*)tree)[-1].m_pair.left);
continue;
}
else
Expand Down
Loading