Skip to content
This repository has been archived by the owner on May 3, 2024. It is now read-only.

CORTX-29713: m0_conf_pver_status() now returns CRITICAL if max failures reached at any level #1571

Merged
merged 4 commits into from
Apr 6, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 52 additions & 3 deletions conf/pvers.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@
vector[M0_CONF_PVER_LVL_CTRLS], \
vector[M0_CONF_PVER_LVL_DRIVES])

/**
 * Outcome of comparing per-level failure counts of a pool version against
 * its per-level failure tolerances.
 */
enum {
	/** No level has hit its tolerance. */
	MAX_FAILURES_NOT_REACHED = 0,
	/** Some level has exactly as many failures as it tolerates. */
	MAX_FAILURES_REACHED     = 1,
	/** Some level has more failures than it tolerates. */
	MAX_FAILURES_EXCEEDED    = 2
};

/** Array of int values. */
struct arr_int {
uint32_t ai_count;
Expand Down Expand Up @@ -804,6 +810,30 @@ static int conf_pver_virtual_create(const struct m0_fid *fid,
return rc;
}

/**
* Check if failures at any level has reached or exceeded max allowed failures.
*/
static int tolerance_failure_cmp(struct m0_conf_pver *pv,
const uint32_t *srecd)
{
int i = 0;
int result = MAX_FAILURES_NOT_REACHED;
uint32_t tolerance ;

while(i < M0_CONF_PVER_HEIGHT) {
ivan-alekhin marked this conversation as resolved.
Show resolved Hide resolved
tolerance = pv->pv_u.subtree.pvs_tolerance[i];
/* Ignore the case of srecd[i] == tolerance == 0. */
if (srecd[i] > 0 && srecd[i] == tolerance)
result = MAX_FAILURES_REACHED;
else if (srecd[i] > tolerance) {
mehjoshi marked this conversation as resolved.
Show resolved Hide resolved
result = MAX_FAILURES_EXCEEDED;
break;
}
i++;
}
return result;
}

int m0_conf_pver_status(struct m0_fid *fid,
struct m0_confc *confc,
struct m0_conf_pver_info *out_info)
Expand All @@ -812,9 +842,11 @@ int m0_conf_pver_status(struct m0_fid *fid,
struct m0_conf_pver *pver;
int rc;
int i = 0;
int failures_at_lvl;
uint32_t srecd[M0_CONF_PVER_HEIGHT];
uint32_t failures = 0;
uint32_t K;
uint32_t *tolerance;

M0_ENTRY();
M0_PRE(fid != NULL);
Expand All @@ -838,15 +870,32 @@ int m0_conf_pver_status(struct m0_fid *fid,
while (i < M0_CONF_PVER_HEIGHT)
failures += srecd[i++];

failures_at_lvl = tolerance_failure_cmp(pver, srecd);
tolerance = pver->pv_u.subtree.pvs_tolerance;

/**
* HEALTHY: there are no failures in the pver.
* DEGRADED: there are fewer than K failures in the pver and no level
* has reached its maximum supported number of failures.
* CRITICAL: there are exactly K failures, or some level has reached its
* maximum supported number of failures.
* DAMAGED: there are more than K failures, or some level has exceeded
* its maximum supported number of failures.
*/
if (failures == 0)
out_info->cpi_state = M0_CPS_HEALTHY;
else if (failures < K)
if (failures > 0 && failures < K &&
failures_at_lvl == MAX_FAILURES_NOT_REACHED)
out_info->cpi_state = M0_CPS_DEGRADED;
else if (failures == K)
if (failures == K || failures_at_lvl == MAX_FAILURES_REACHED)
out_info->cpi_state = M0_CPS_CRITICAL;
else
if (failures > K || failures_at_lvl == MAX_FAILURES_EXCEEDED)
out_info->cpi_state = M0_CPS_DAMAGED;

M0_LOG(M0_DEBUG, "state: %d, failures: %d", out_info->cpi_state, failures);
CONF_PVER_VECTOR_LOG("failed objs of", FID_P(&pver->pv_obj.co_id), srecd);
CONF_PVER_VECTOR_LOG("tolerance of", FID_P(&pver->pv_obj.co_id), tolerance);

return M0_RC(rc);
} M0_EXPORTED(m0_conf_pver_status);

Expand Down