-
Notifications
You must be signed in to change notification settings - Fork 1.1k
fix: make FreeMemWithEvictionStep atomic #4885
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1332,8 +1332,12 @@ int32_t DbSlice::GetNextSegmentForEviction(int32_t segment_id, DbIndex db_ind) c | |
| db_arr_[db_ind]->prime.GetSegmentCount(); | ||
| } | ||
|
|
||
| pair<uint64_t, size_t> DbSlice::FreeMemWithEvictionStep(DbIndex db_ind, size_t starting_segment_id, | ||
| size_t increase_goal_bytes) { | ||
| pair<uint64_t, size_t> DbSlice::FreeMemWithEvictionStepAtomic(DbIndex db_ind, | ||
| size_t starting_segment_id, | ||
| size_t increase_goal_bytes) { | ||
| // Disable flush journal changes to prevent preemtion | ||
| journal::JournalFlushGuard journal_flush_guard(shard_owner()->journal()); | ||
| FiberAtomicGuard guard; | ||
| DCHECK(!owner_->IsReplica()); | ||
|
|
||
| size_t evicted_items = 0, evicted_bytes = 0; | ||
|
|
@@ -1360,60 +1364,55 @@ pair<uint64_t, size_t> DbSlice::FreeMemWithEvictionStep(DbIndex db_ind, size_t s | |
| bool record_keys = owner_->journal() != nullptr || expired_keys_events_recording_; | ||
| vector<string> keys_to_journal; | ||
|
|
||
| { | ||
| FiberAtomicGuard guard; | ||
| for (int32_t slot_id = num_slots - 1; slot_id >= 0; --slot_id) { | ||
| for (int32_t bucket_id = num_buckets - 1; bucket_id >= 0; --bucket_id) { | ||
| // pick a random segment to start with in each eviction, | ||
| // as segment_id does not imply any recency, and random selection should be fair enough | ||
| int32_t segment_id = starting_segment_id; | ||
| for (size_t num_seg_visited = 0; num_seg_visited < max_segment_to_consider; | ||
| ++num_seg_visited, segment_id = GetNextSegmentForEviction(segment_id, db_ind)) { | ||
| const auto& bucket = db_table->prime.GetSegment(segment_id)->GetBucket(bucket_id); | ||
| if (bucket.IsEmpty()) | ||
| continue; | ||
|
|
||
| if (!bucket.IsBusy(slot_id)) | ||
| continue; | ||
|
|
||
| auto evict_it = db_table->prime.GetIterator(segment_id, bucket_id, slot_id); | ||
| if (evict_it->first.IsSticky() || !evict_it->second.HasAllocated()) | ||
| continue; | ||
|
|
||
| // check if the key is locked by looking up transaction table. | ||
| const auto& lt = db_table->trans_locks; | ||
| string_view key = evict_it->first.GetSlice(&tmp); | ||
| if (lt.Find(LockTag(key)).has_value()) | ||
| continue; | ||
|
|
||
| if (record_keys) | ||
| keys_to_journal.emplace_back(key); | ||
|
|
||
| evicted_bytes += evict_it->first.MallocUsed() + evict_it->second.MallocUsed(); | ||
| ++evicted_items; | ||
| PerformDeletion(Iterator(evict_it, StringOrView::FromView(key)), db_table.get()); | ||
|
|
||
| // returns when whichever condition is met first | ||
| if ((evicted_items == max_eviction_per_hb) || (evicted_bytes >= increase_goal_bytes)) | ||
| goto finish; | ||
| } | ||
| for (int32_t slot_id = num_slots - 1; slot_id >= 0; --slot_id) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No functional changes in this block of code. I moved the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @adiholden I can also move back the |
||
| for (int32_t bucket_id = num_buckets - 1; bucket_id >= 0; --bucket_id) { | ||
| // pick a random segment to start with in each eviction, | ||
| // as segment_id does not imply any recency, and random selection should be fair enough | ||
| int32_t segment_id = starting_segment_id; | ||
| for (size_t num_seg_visited = 0; num_seg_visited < max_segment_to_consider; | ||
| ++num_seg_visited, segment_id = GetNextSegmentForEviction(segment_id, db_ind)) { | ||
| const auto& bucket = db_table->prime.GetSegment(segment_id)->GetBucket(bucket_id); | ||
| if (bucket.IsEmpty() || !bucket.IsBusy(slot_id)) | ||
| continue; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Only difference is I combined those two statements |
||
|
|
||
| auto evict_it = db_table->prime.GetIterator(segment_id, bucket_id, slot_id); | ||
| if (evict_it->first.IsSticky() || !evict_it->second.HasAllocated()) | ||
| continue; | ||
|
|
||
| // check if the key is locked by looking up transaction table. | ||
| const auto& lt = db_table->trans_locks; | ||
| string_view key = evict_it->first.GetSlice(&tmp); | ||
| if (lt.Find(LockTag(key)).has_value()) | ||
| continue; | ||
|
|
||
| if (record_keys) | ||
| keys_to_journal.emplace_back(key); | ||
|
|
||
| evicted_bytes += evict_it->first.MallocUsed() + evict_it->second.MallocUsed(); | ||
| ++evicted_items; | ||
| PerformDeletion(Iterator(evict_it, StringOrView::FromView(key)), db_table.get()); | ||
|
|
||
| // returns when whichever condition is met first | ||
| if ((evicted_items == max_eviction_per_hb) || (evicted_bytes >= increase_goal_bytes)) | ||
| goto finish; | ||
| } | ||
| } | ||
| } // FiberAtomicGuard | ||
| } | ||
|
|
||
| finish: | ||
|
|
||
| // send the deletion to the replicas. | ||
| // fiber preemption could happen in this phase. | ||
| for (string_view key : keys_to_journal) { | ||
| if (auto journal = owner_->journal(); journal) | ||
| // Won't block because we disabled journal flushing. See first line of this function. | ||
| RecordExpiryBlocking(db_ind, key); | ||
|
|
||
| if (expired_keys_events_recording_) | ||
| db_table->expired_keys_events_.emplace_back(key); | ||
| } | ||
|
|
||
| SendQueuedInvalidationMessages(); | ||
| // This might not always be atomic on exceptional cases -- see comments on the function | ||
| // declaration. | ||
| SendQueuedInvalidationMessagesAsync(); | ||
| auto time_finish = absl::GetCurrentTimeNanos(); | ||
| events_.evicted_keys += evicted_items; | ||
| DVLOG(2) << "Eviction time (us): " << (time_finish - time_start) / 1000; | ||
|
|
@@ -1529,33 +1528,50 @@ void DbSlice::QueueInvalidationTrackingMessageAtomic(std::string_view key) { | |
| } | ||
| } | ||
|
|
||
| void DbSlice::SendQueuedInvalidationMessagesCb(const TrackingMap& track_map, unsigned idx) const { | ||
| for (auto& [key, client_list] : track_map) { | ||
| for (auto& client : client_list) { | ||
| if (client.IsExpired() || (client.Thread() != idx)) { | ||
| continue; | ||
| } | ||
| auto* conn = client.Get(); | ||
| auto* cntx = static_cast<ConnectionContext*>(conn->cntx()); | ||
| if (cntx && cntx->conn_state.tracking_info_.IsTrackingOn()) { | ||
| conn->SendInvalidationMessageAsync({key}); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| void DbSlice::SendQueuedInvalidationMessages() { | ||
| // We run while loop because when we block below, we might have new items added to | ||
| // pending_send_map_. | ||
| while (!pending_send_map_.empty()) { | ||
| auto local_map = std::move(pending_send_map_); | ||
|
|
||
| // Notify all the clients. this function is not efficient, | ||
| // because it broadcasts to all threads unrelated to the subscribers for the key. | ||
| auto local_map = std::move(pending_send_map_); | ||
| auto cb = [&](unsigned idx, util::ProactorBase*) { | ||
| for (auto& [key, client_list] : local_map) { | ||
| for (auto& client : client_list) { | ||
| if (client.IsExpired() || (client.Thread() != idx)) { | ||
| continue; | ||
| } | ||
| auto* conn = client.Get(); | ||
| auto* cntx = static_cast<ConnectionContext*>(conn->cntx()); | ||
| if (cntx && cntx->conn_state.tracking_info_.IsTrackingOn()) { | ||
| conn->SendInvalidationMessageAsync({key}); | ||
| } | ||
| } | ||
| } | ||
| SendQueuedInvalidationMessagesCb(local_map, idx); | ||
| }; | ||
|
|
||
| shard_set->pool()->AwaitBrief(std::move(cb)); | ||
| } | ||
| } | ||
|
|
||
| // This function might preempt if the task queue within DispatchBrief is full and we can't | ||
| // enqueue the callback. Although a rare case, this code might not be atomic. | ||
| void DbSlice::SendQueuedInvalidationMessagesAsync() { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We drained it before because we blocked. Now we don't. TODO investigate: we should call this also from heartbeat (so on the second iteration it drains it if there were items added).
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After looking at this the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As for DispatchBrief, I also don't see any issue whatsoever, they messages will reach the connections eventually and it should be enough
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. DispatchBrief can also block btw, it's just a much rare event
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @romange the rationale was this doesn't happen in practice because we don't reach this state easily (where we have the task queues internally full). Your comment is very valid, and maybe we should move this outside of the non preemptive critical section. I think it should be a simple change. I will update on this
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @romange would it worth to add a function that won't dispatch if the task queue is full (but also won't preempt and return to the caller?) That way, we can send the pending_items later if we can't dispatch because the intenral queues are full |
||
| if (pending_send_map_.empty()) { | ||
| return; | ||
| } | ||
| // DispatchBrief will copy local_map | ||
| auto cb = [lm = std::move(pending_send_map_), this](unsigned idx, util::ProactorBase*) { | ||
| SendQueuedInvalidationMessagesCb(lm, idx); | ||
| }; | ||
|
|
||
| shard_set->pool()->DispatchBrief(std::move(cb)); | ||
| } | ||
|
|
||
| void DbSlice::StartSampleTopK(DbIndex db_ind, uint32_t min_freq) { | ||
| auto& db = *db_arr_[db_ind]; | ||
| if (db.top_keys) { | ||
|
|
@@ -1725,9 +1741,8 @@ void DbSlice::OnCbFinishBlocking() { | |
| } | ||
| } | ||
|
|
||
| if (!pending_send_map_.empty()) { | ||
| SendQueuedInvalidationMessages(); | ||
| } | ||
| // Sends only if !pending_send_map_.empty() | ||
| SendQueuedInvalidationMessages(); | ||
| } | ||
|
|
||
| void DbSlice::CallChangeCallbacks(DbIndex id, const ChangeReq& cr) const { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The caller to this function also uses journal::JournalFlushGuard
In this class distructor we do journal_->SetFlushMode(true);
This means that the logic now is broken if there are other calls to journal not inside FreeMemWithEvictionStepAtomic