Skip to content
This repository has been archived by the owner on May 3, 2024. It is now read-only.

Commit

Permalink
EOS-25111 be/log_discard: avoid race between wait() and flush op (#1092)
Browse files Browse the repository at this point in the history
m0_be_log_discard_flush() may finish waiting after io is done, but
before lds_discard_ast() is posted. It would lead to
be_log_discard_wait() to not to wait for the ast to be finished, which
leads to an assertion failure [1]. In this case traces look like [2] and
stack backtrace looks like [3].

This patch moves flush op to DONE at the end of the ast, which makes
flush() wait until the ast is actually executed.

The bug was discovered during EOS-25005 patch testing.

[1]

motr[24531]:  a3b0  FATAL  [lib/assert.c:50:m0_panic]  panic: (!ld->lds_sync_in_progress) at m0_be_log_discard_fini() (be/log_discard.c:143)  [git: 2.0.0-307-72-g0e307f7b-dirty] /w/var/m0ut/m0trace.24531
Motr panic: (!ld->lds_sync_in_progress) at m0_be_log_discard_fini() be/log_discard.c:143 (errno: 4) (last failed: none) [git: 2.0.0-307-72-g0e307f7b-dirty] pid: 24531  /w/var/m0ut/m0trace.24531

[2]

92998540  157294.289.381909  7f86d97f98e0  DEBUG   be/log_discard.c:288:be_log_discard_sync_done_cb    ld=0x7f86f709d8f8
92998541  157294.289.382245  7f86d97f98b0  CALL    be/op.c:154:m0_be_op_reset                          > op=0x7f86f709da40
...
92998549  157294.289.431085  7f86d97f9c40  DEBUG   sm/sm.c:158:m0_sm_asts_run                          grp=0x22d96b0 forkq_pop: 0x7f86f709de10 <- 0x7f86f6049da0
92998550  157294.289.432156  7f86d97f9c00  CALL    be/log_discard.c:256:be_log_discard_ast             > ld=0x7f86f709d8f8
92998551  157294.289.433624  7ffd2e9e10c0  FATAL   lib/assert.c:50:m0_panic                            panic: (!ld->lds_sync_in_progress) at m0_be_log_discard_fini() (be/log_discard.c:143)  [git: 2.0.0-307-37-gd35074f6-dirty] /w/var/m0ut/m0trace.22409
92998552  157294.289.435003  7f86d97f9ba0  CALL    be/log_discard.c:229:be_log_discard_item_discard    > ld=0x7f86f709d8f8 ldi=0x7f862039bf40
92998553  157294.289.436948  7f86d97f9b30  DEBUG   be/tx_group_format.c:549:m0_be_group_format_discard   log=0x7f86f709a838 ldi=0x7f862039bf40 ext=[0, 7000)
92998554  157294.289.439155  7f86d97f9ad0  CALL    be/log.c:526:m0_be_log_record_discard               > log=(lg_current=118784 lg_discarded=0 lg_free=134082560) size=28672
...
92998588  157294.289.570768  7f86d97f9ba0  CALL    be/log_discard.c:229:be_log_discard_item_discard    > ld=0x7f86f709d8f8 ldi=0x7f862039bb80
92998589  157294.289.571102  7f86d97f9b30  DEBUG   be/tx_group_format.c:549:m0_be_group_format_discard   log=0x7f86f709a838 ldi=0x7f862039bb80 ext=[1b000, 1d000)
92998590  157294.289.571601  7f86d97f9ad0  CALL    be/log.c:526:m0_be_log_record_discard               > log=(lg_current=118784 lg_discarded=110592 lg_free=134193152) size=8192
92998591  157294.289.572156  7f86d97f9ad0  DEBUG   be/log.c:545:m0_be_log_record_discard               log=(lg_current=118784 lg_discarded=118784 lg_free=134201344) size=8192
92998592  157294.289.572524  7f86d97f9a90  CALL    be/engine.c:865:m0_be_engine_got_log_space_cb       > en=0x7f86f709a6c8 log=0x7f86f709a838
92998593  157294.289.573643  7f86d97f9b70  DEBUG   be/log_discard.c:451:m0_be_log_discard_item_put     ld=0x7f86f709d8f8 ldi=0x7f862039bb80

[3]

  Thread 1 (Thread 0x7f8701248840 (LWP 22409)):
  #0  0x00007f86f310f277 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56
  #1  0x00007f86f3110968 in __GI_abort () at abort.c:90
  #2  0x00007f86f50ffe1e in m0_arch_panic (c=0x7f86f55d4840 <__pctx.7291>, ap=0x7ffd2e9e1168) at lib/user_space/uassert.c:131
  #3  0x00007f86f50e0a78 in m0_panic (ctx=0x7f86f55d4840 <__pctx.7291>) at lib/assert.c:52
  #4  0x00007f86f4fab1a2 in m0_be_log_discard_fini (ld=0x7f86f709d8f8 <be+17944>) at be/log_discard.c:143
  #5  0x00007f86f4f96403 in be_domain_level_leave (module=0x7f86f70992e0 <be>) at be/domain.c:906
  #6  0x00007f86f5171e09 in m0_module_fini (module=0x7f86f70992e0 <be>, level=-1) at module/module.c:154
  #7  0x00007f86f67c61f3 in m0_be_ut_backend_fini (ut_be=0x7f86f70992e0 <be>) at be/ut/helper.c:448
  #8  0x00007f86f6816645 in iter_ut_fini (svc=0x7f857252b168) at dix/cm/ut/iter_ut.c:708
  #9  0x00007f86f6816a39 in empty_cctg () at dix/cm/ut/iter_ut.c:786
  #10 0x00007f86f69a3736 in run_test (test=0x64a708 <dix_cm_iter_ut+4552>, max_name_len=43) at ut/ut.c:390
  #11 0x00007f86f69a3a28 in run_suite (suite=0x649540 <dix_cm_iter_ut>, max_name_len=43) at ut/ut.c:459
  #12 0x00007f86f69a3ccc in tests_run_all (m=0x7f870102f4a0 <ut.12684>) at ut/ut.c:513
  #13 0x00007f86f69a3d2a in m0_ut_run () at ut/ut.c:539
  #14 0x0000000000404a42 in main (argc=3, argv=0x7ffd2e9e1d58) at ut/m0ut.c:525

Signed-off-by: Maksym Medvied <maksym.medvied@seagate.com>
  • Loading branch information
max-seagate authored Oct 5, 2021
1 parent 190d3b6 commit db9f195
Showing 1 changed file with 15 additions and 10 deletions.
25 changes: 15 additions & 10 deletions be/log_discard.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,17 @@ static bool be_log_discard_is_locked(struct m0_be_log_discard *ld)
return m0_mutex_is_locked(&ld->lds_lock);
}

static void be_log_discard_flush_finished(struct m0_be_log_discard *ld)
{
M0_PRE(be_log_discard_is_locked(ld));
M0_PRE(ld->lds_flush_op != NULL);
M0_PRE(ld_start_tlist_is_empty(&ld->lds_start_q));

m0_be_op_active(ld->lds_flush_op);
m0_be_op_done(ld->lds_flush_op);
ld->lds_flush_op = NULL;
}

static void be_log_discard_check_sync(struct m0_be_log_discard *ld,
bool force)
{
Expand All @@ -197,18 +208,10 @@ static void be_log_discard_check_sync(struct m0_be_log_discard *ld,
break;
ld->lds_sync_item = ldi;
} m0_tl_endfor;
if (ld->lds_sync_item == NULL && ld->lds_flush_op != NULL) {
M0_ASSERT(ld_start_tlist_is_empty(&ld->lds_start_q));
m0_be_op_active(ld->lds_flush_op);
m0_be_op_done(ld->lds_flush_op);
}
if (ld->lds_sync_item == NULL && ld->lds_flush_op != NULL)
be_log_discard_flush_finished(ld);
if (ld->lds_sync_item != NULL) {
ld->lds_sync_in_progress = true;
if (ld->lds_flush_op != NULL) {
m0_be_op_set_add(ld->lds_flush_op,
&ld->lds_sync_op);
ld->lds_flush_op = NULL;
}
M0_LOG(M0_DEBUG, "ld=%p lds_sync_item=%p",
ld, ld->lds_sync_item);
/* be_log_discard_sync_done_cb() locks ld */
Expand Down Expand Up @@ -274,6 +277,8 @@ static void be_log_discard_ast(struct m0_sm_group *grp,
be_log_discard_check_sync(ld, false);
ld->lds_discard_ast_posted = false;

if (ld->lds_flush_op != NULL)
be_log_discard_flush_finished(ld);
if (ld->lds_discard_waiting)
m0_semaphore_up(&ld->lds_discard_wait_sem);
be_log_discard_unlock(ld);
Expand Down

0 comments on commit db9f195

Please sign in to comment.