Skip to content

Commit

Permalink
DAOS-7056 object: do not retry internally for migration (#5106) (#5351)
Browse files Browse the repository at this point in the history
1. Do not retry internally for migration. During system
shutdown, if the migration is inside a retry loop (for
example, repeatedly refreshing the pool map from the pool
leader), there is no easy way to stop the migration process
inside the client stack. So return all failures to the
migration instead. If a failure happens, migration (rebuild)
will requeue the job anyway.

2. Add a schedule delay time to the rebuild task instead of
sleeping directly in rebuild_task_ult(), since sleeping there
might block the current rebuild from finishing.

Signed-off-by: Di Wang <di.wang@intel.com>
  • Loading branch information
wangdi authored Apr 8, 2021
1 parent ecd4bb7 commit ee51b35
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 11 deletions.
2 changes: 1 addition & 1 deletion src/include/daos_srv/rebuild.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ typedef enum {

int ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver,
struct pool_target_id_list *tgts,
daos_rebuild_opc_t rebuild_op);
daos_rebuild_opc_t rebuild_op, uint64_t delay_sec);
int ds_rebuild_query(uuid_t pool_uuid,
struct daos_rebuild_status *status);
int ds_rebuild_regenerate_task(struct ds_pool *pool);
Expand Down
10 changes: 8 additions & 2 deletions src/object/cli_obj.c
Original file line number Diff line number Diff line change
Expand Up @@ -3619,7 +3619,7 @@ obj_comp_cb(tse_task_t *task, void *data)
DAOS_FAIL_CHECK(DAOS_DTX_NO_RETRY))
obj_auxi->io_retry = 0;

if (pm_stale || obj_auxi->io_retry)
if (!obj_auxi->no_retry && (pm_stale || obj_auxi->io_retry))
obj_retry_cb(task, obj, obj_auxi, pm_stale);

if (!obj_auxi->io_retry) {
Expand Down Expand Up @@ -4079,8 +4079,10 @@ dc_obj_fetch_task(tse_task_t *task)
if (obj_auxi->ec_wait_recov)
goto out_task;

if (args->extra_flags & DIOF_FOR_MIGRATION)
if (args->extra_flags & DIOF_FOR_MIGRATION) {
obj_auxi->flags |= ORF_FOR_MIGRATION;
obj_auxi->no_retry = 1;
}

if (args->extra_flags & DIOF_CHECK_EXISTENCE) {
obj_auxi->flags |= ORF_CHECK_EXISTENCE;
Expand Down Expand Up @@ -4670,6 +4672,10 @@ obj_list_common(tse_task_t *task, int opc, daos_obj_list_t *args)
D_GOTO(out_task, rc);
}

if (args->dkey_anchor != NULL &&
daos_anchor_get_flags(args->dkey_anchor) & DIOF_FOR_MIGRATION)
obj_auxi->no_retry = 1;

if (obj_is_ec(obj))
obj_auxi->is_ec_obj = 1;

Expand Down
7 changes: 4 additions & 3 deletions src/pool/srv_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -3968,7 +3968,7 @@ ds_pool_update_internal(uuid_t pool_uuid, struct pool_target_id_list *tgts,
* during reintegration/addition
*/
rc = ds_rebuild_schedule(svc->ps_pool, map_version, tgts,
RB_OP_RECLAIM);
RB_OP_RECLAIM, 0);
if (rc != 0) {
D_ERROR("failed to schedule reclaim rc: "DF_RC"\n",
DP_RC(rc));
Expand Down Expand Up @@ -4370,7 +4370,8 @@ ds_pool_update(uuid_t pool_uuid, crt_opcode_t opc,
D_DEBUG(DF_DSMS, "map ver %u/%u\n", map_version ? *map_version : -1,
tgt_map_ver);
if (tgt_map_ver != 0) {
rc = ds_rebuild_schedule(pool, tgt_map_ver, &target_list, op);
rc = ds_rebuild_schedule(pool, tgt_map_ver, &target_list, op,
0);
if (rc != 0) {
D_ERROR("rebuild fails rc: "DF_RC"\n", DP_RC(rc));
D_GOTO(out, rc);
Expand Down Expand Up @@ -4517,7 +4518,7 @@ pool_extend_internal(uuid_t pool_uuid, struct rsvc_hint *hint,

/* Schedule an extension rebuild for those targets */
rc = ds_rebuild_schedule(svc->ps_pool, *map_version_p, &tgts,
RB_OP_EXTEND);
RB_OP_EXTEND, 0);
if (rc != 0) {
D_ERROR("failed to schedule extend rc: "DF_RC"\n", DP_RC(rc));
D_GOTO(out_lock, rc);
Expand Down
3 changes: 2 additions & 1 deletion src/rebuild/rebuild_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,9 @@ struct rebuild_task {
d_list_t dst_list;
uuid_t dst_pool_uuid;
struct pool_target_id_list dst_tgts;
uint32_t dst_map_ver;
daos_rebuild_opc_t dst_rebuild_op;
uint64_t dst_schedule_time;
uint32_t dst_map_ver;
};

/* Per pool structure in TLS to check pool rebuild status
Expand Down
19 changes: 15 additions & 4 deletions src/rebuild/srv.c
Original file line number Diff line number Diff line change
Expand Up @@ -1298,7 +1298,7 @@ rebuild_task_ult(void *arg)
rgt->rgt_status.rs_done = 0;
ret = ds_rebuild_schedule(pool, task->dst_map_ver,
&task->dst_tgts,
task->dst_rebuild_op);
task->dst_rebuild_op, 5);
if (ret != 0)
D_ERROR("reschedule "DF_RC"\n", DP_RC(ret));
else
Expand Down Expand Up @@ -1380,7 +1380,13 @@ rebuild_ults(void *arg)
* wait to start the next operation until the current
* one completes
*/
if (pool_is_rebuilding(task->dst_pool_uuid))
uint64_t cur_ts = 0;

rc = daos_gettime_coarse(&cur_ts);
D_ASSERT(rc == 0);

if (cur_ts < task->dst_schedule_time ||
pool_is_rebuilding(task->dst_pool_uuid))
continue;

rc = dss_ult_create(rebuild_task_ult, task,
Expand Down Expand Up @@ -1546,10 +1552,11 @@ rebuild_print_list_update(const uuid_t uuid, const uint32_t map_ver,
int
ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver,
struct pool_target_id_list *tgts,
daos_rebuild_opc_t rebuild_op)
daos_rebuild_opc_t rebuild_op, uint64_t delay_sec)
{
struct rebuild_task *task;
int rc;
uint64_t cur_ts = 0;

if (pool->sp_stopping) {
D_DEBUG(DB_REBUILD, DF_UUID" is stopping,"
Expand All @@ -1568,6 +1575,10 @@ ds_rebuild_schedule(struct ds_pool *pool, uint32_t map_ver,
if (task == NULL)
return -DER_NOMEM;

rc = daos_gettime_coarse(&cur_ts);
D_ASSERT(rc == 0);

task->dst_schedule_time = cur_ts + delay_sec;
task->dst_map_ver = map_ver;
task->dst_rebuild_op = rebuild_op;
uuid_copy(task->dst_pool_uuid, pool->sp_uuid);
Expand Down Expand Up @@ -1626,7 +1637,7 @@ regenerate_task_internal(struct ds_pool *pool, struct pool_target *tgts,
id_list.pti_number = 1;

rc = ds_rebuild_schedule(pool, tgt->ta_comp.co_fseq,
&id_list, rebuild_op);
&id_list, rebuild_op, 0);
if (rc) {
D_ERROR(DF_UUID" schedule op %d ver %d failed: "
DF_RC"\n",
Expand Down

0 comments on commit ee51b35

Please sign in to comment.