Skip to content

Commit

Permalink
Mas i370 deletepending (#377)
Browse files Browse the repository at this point in the history
Previously delete_confirmation was blocked on work_ongoing.

However, if the penciller has a work backlog, work_ongoing may be a recurring problem ... and some files may remain undeleted long after their use - lifetimes for L0 files in particular have been seen to rise from 10-15s to 5m+.

Letting L0 files linger can have a significant impact on memory. In put-heavy tests (e.g. when testing riak-admin transfers) the memory footprint of a riak node has been observed peaking more than 80% above normal levels, when compared to using this patch.

This PR allows for deletes to be confirmed even when there is work ongoing, by postponing the updating of the manifest until the manifest is next returned from the clerk.

Co-authored-by: Thomas Arts <thomas.arts@quviq.com>
  • Loading branch information
martinsumner and ThomasArts committed May 24, 2022
1 parent 9e8fcbc commit 2648c9a
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 44 deletions.
4 changes: 3 additions & 1 deletion src/leveled_bookie.erl
Original file line number Diff line number Diff line change
Expand Up @@ -3074,8 +3074,10 @@ is_empty_headonly_test() ->

%% Starting a bookie without a root_path option must fail with
%% {error, no_root_path}.  Console error logging is silenced around the
%% deliberately failing start, and restored afterwards so other tests
%% are unaffected.
undefined_rootpath_test() ->
    Opts = [{max_journalsize, 1000000}, {cache_size, 500}],
    error_logger:tty(false),
    R = gen_server:start(?MODULE, [set_defaults(Opts)], []),
    ?assertMatch({error, no_root_path}, R),
    error_logger:tty(true).


foldkeys_headonly_test() ->
Expand Down
83 changes: 63 additions & 20 deletions src/leveled_penciller.erl
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,9 @@

work_ongoing = false :: boolean(), % i.e. compaction work
work_backlog = false :: boolean(), % i.e. compaction work

pending_removals = [] :: list(string()),
maybe_release = false :: boolean(),

timings = no_timing :: pcl_timings(),
timings_countdown = 0 :: integer(),
Expand Down Expand Up @@ -992,36 +995,76 @@ handle_cast({manifest_change, Manifest}, State) ->
ok =
leveled_pclerk:clerk_promptdeletions(State#state.clerk, NewManSQN),
% This is accepted as the new manifest, files may be deleted
UpdManifest =
UpdManifest0 =
leveled_pmanifest:merge_snapshot(State#state.manifest, Manifest),
% Need to preserve the penciller's view of snapshots stored in
% the manifest
{noreply, State#state{manifest=UpdManifest, work_ongoing=false}}
UpdManifest1 =
leveled_pmanifest:clear_pending(
UpdManifest0,
State#state.pending_removals,
State#state.maybe_release),
{noreply,
State#state{
manifest=UpdManifest1,
pending_removals = [],
maybe_release = false,
work_ongoing=false}}
end;
handle_cast({release_snapshot, Snapshot}, State) ->
Manifest0 = leveled_pmanifest:release_snapshot(State#state.manifest,
Snapshot),
leveled_log:log("P0003", [Snapshot]),
{noreply, State#state{manifest=Manifest0}};
handle_cast({confirm_delete, Filename, FilePid}, State=#state{is_snapshot=Snap})
when Snap == false ->
case State#state.work_ongoing of
false ->
R2D = leveled_pmanifest:ready_to_delete(State#state.manifest,
Filename),
case R2D of
{true, M0} ->
leveled_log:log("P0005", [Filename]),
ok = leveled_sst:sst_deleteconfirmed(FilePid),
{noreply, State#state{manifest=M0}};
{false, _M0} ->
{noreply, State}
end;
handle_cast({confirm_delete, PDFN, FilePid}, State=#state{is_snapshot=Snap})
        when Snap == false ->
    % This is a two stage process. A file that is ready for deletion can be
    % checked against the manifest to prompt the deletion, however it must
    % also be removed from the manifest's list of pending deletes.  This is
    % only possible when the manifest is in control of the penciller not the
    % clerk.
    % When work is ongoing (i.e. the manifest is under control of the clerk),
    % any removals from the manifest need to be stored temporarily (in
    % pending_removals) until such time that the manifest is in control of
    % the penciller and can be updated.
    % The maybe_release boolean on state is used if any file is not ready to
    % delete, and there is work ongoing.  This will then trigger a check to
    % ensure any timed out snapshots are released, in case this is the factor
    % blocking the delete confirmation.
    % When an updated manifest is submitted by the clerk, the
    % pending_removals will be cleared from pending using the maybe_release
    % boolean.
    case leveled_pmanifest:ready_to_delete(State#state.manifest, PDFN) of
        true ->
            leveled_log:log("P0005", [PDFN]),
            ok = leveled_sst:sst_deleteconfirmed(FilePid),
            case State#state.work_ongoing of
                true ->
                    % Clerk holds the manifest - queue the removal until
                    % the manifest is next returned by the clerk
                    UpdRemovals =
                        lists:usort([PDFN|State#state.pending_removals]),
                    {noreply,
                        State#state{
                            pending_removals = UpdRemovals}};
                false ->
                    UpdManifest =
                        leveled_pmanifest:clear_pending(
                            State#state.manifest,
                            [PDFN],
                            false),
                    {noreply,
                        State#state{manifest = UpdManifest}}
            end;
        false ->
            case State#state.work_ongoing of
                true ->
                    {noreply, State#state{maybe_release = true}};
                false ->
                    UpdManifest =
                        leveled_pmanifest:clear_pending(
                            State#state.manifest,
                            [],
                            true),
                    {noreply,
                        State#state{manifest = UpdManifest}}
            end
    end;
handle_cast({levelzero_complete, FN, StartKey, EndKey, Bloom}, State) ->
leveled_log:log("P0029", []),
Expand Down
64 changes: 41 additions & 23 deletions src/leveled_pmanifest.erl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
release_snapshot/2,
merge_snapshot/2,
ready_to_delete/2,
clear_pending/3,
check_for_work/2,
is_basement/2,
levelzero_present/1,
Expand Down Expand Up @@ -545,32 +546,41 @@ release_snapshot(Manifest, Pid) ->
min_snapshot_sqn = MinSnapSQN}
end.

%% @doc
%% A SST file which is in the delete_pending state can check to see if it is
%% ready to delete against the manifest.
%% This does not update the manifest; a call is required to clear_pending to
%% remove the file from the manifest's list of pending_deletes.
-spec ready_to_delete(manifest(), string()) -> boolean().
ready_to_delete(Manifest, Filename) ->
    PendingDelete = dict:find(Filename, Manifest#manifest.pending_deletes),
    case {PendingDelete, Manifest#manifest.min_snapshot_sqn} of
        {{ok, _}, 0} ->
            % no snapshots, so nothing can be referencing the file
            true;
        {{ok, {ChangeSQN, _ME}}, N} when N >= ChangeSQN ->
            % Every snapshot is looking at a version of history after this
            % was removed
            true;
        _ ->
            % Either the file is not pending delete, or a snapshot may still
            % be referencing it.
            % This may also be because of i355 - a race condition when a
            % second confirm_delete may be sent prior to the first being
            % processed
            false
    end.

%% @doc
%% Remove each of the given filenames from the manifest's record of pending
%% deletes.  When MaybeRelease is set, a phantom snapshot is then released -
%% potentially timing out a stale snapshot which may be the factor blocking
%% delete confirmation (and triggering a log of the release).
-spec clear_pending(manifest(), list(string()), boolean()) -> manifest().
clear_pending(Manifest, FileNames, MaybeRelease) ->
    PendingDeletes =
        lists:foldl(
            fun dict:erase/2,
            Manifest#manifest.pending_deletes,
            FileNames),
    UpdManifest = Manifest#manifest{pending_deletes = PendingDeletes},
    case MaybeRelease of
        true ->
            release_snapshot(UpdManifest, ?PHANTOM_PID);
        false ->
            UpdManifest
    end.

-spec check_for_work(manifest(), list()) -> {list(), integer()}.
%% @doc
%% Check for compaction work in the manifest - look at levels which contain
Expand Down Expand Up @@ -1308,6 +1318,14 @@ levelzero_present_test() ->
Man1 = insert_manifest_entry(Man0, 1, 0, E0),
?assertMatch(true, levelzero_present(Man1)).

%% Test helper reproducing the pre-split ready_to_delete/2 contract:
%% returns {IsReady, UpdManifest} - clearing the file from the pending
%% deletes when ready, otherwise prompting a possible snapshot release.
ready_to_delete_combined(Manifest, Filename) ->
    Ready = ready_to_delete(Manifest, Filename),
    Removals =
        case Ready of
            true -> [Filename];
            false -> []
        end,
    {Ready, clear_pending(Manifest, Removals, not Ready)}.

snapshot_release_test() ->
Man6 = element(7, initial_setup()),
E1 = #manifest_entry{start_key={i, "Bucket1", {"Idx1", "Fld1"}, "K8"},
Expand All @@ -1334,31 +1352,31 @@ snapshot_release_test() ->
Man12 = remove_manifest_entry(Man11, 4, 1, E3),
Man13 = add_snapshot(Man12, pid_a4, 3600),

?assertMatch(false, element(1, ready_to_delete(Man8, "Z1"))),
?assertMatch(false, element(1, ready_to_delete(Man10, "Z2"))),
?assertMatch(false, element(1, ready_to_delete(Man12, "Z3"))),
?assertMatch(false, element(1, ready_to_delete_combined(Man8, "Z1"))),
?assertMatch(false, element(1, ready_to_delete_combined(Man10, "Z2"))),
?assertMatch(false, element(1, ready_to_delete_combined(Man12, "Z3"))),

Man14 = release_snapshot(Man13, pid_a1),
?assertMatch(false, element(1, ready_to_delete(Man14, "Z2"))),
?assertMatch(false, element(1, ready_to_delete(Man14, "Z3"))),
{Bool14, Man15} = ready_to_delete(Man14, "Z1"),
?assertMatch(false, element(1, ready_to_delete_combined(Man14, "Z2"))),
?assertMatch(false, element(1, ready_to_delete_combined(Man14, "Z3"))),
{Bool14, Man15} = ready_to_delete_combined(Man14, "Z1"),
?assertMatch(true, Bool14),

%This doesn't change anything - released snaphsot not the min
Man16 = release_snapshot(Man15, pid_a4),
?assertMatch(false, element(1, ready_to_delete(Man16, "Z2"))),
?assertMatch(false, element(1, ready_to_delete(Man16, "Z3"))),
?assertMatch(false, element(1, ready_to_delete_combined(Man16, "Z2"))),
?assertMatch(false, element(1, ready_to_delete_combined(Man16, "Z3"))),

Man17 = release_snapshot(Man16, pid_a2),
?assertMatch(false, element(1, ready_to_delete(Man17, "Z3"))),
{Bool17, Man18} = ready_to_delete(Man17, "Z2"),
?assertMatch(false, element(1, ready_to_delete_combined(Man17, "Z3"))),
{Bool17, Man18} = ready_to_delete_combined(Man17, "Z2"),
?assertMatch(true, Bool17),

Man19 = release_snapshot(Man18, pid_a3),

io:format("MinSnapSQN ~w~n", [Man19#manifest.min_snapshot_sqn]),

{Bool19, _Man20} = ready_to_delete(Man19, "Z3"),
{Bool19, _Man20} = ready_to_delete_combined(Man19, "Z3"),
?assertMatch(true, Bool19).


Expand Down

0 comments on commit 2648c9a

Please sign in to comment.