Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16829 client: Fix daos pool query regression #15634

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions docs/admin/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -554,6 +554,21 @@ To resolve the issue:

Alternatively, the administrator may erase and re-format the DAOS system to start over fresh using the new addresses.

### Engines become unavailable

Engines may become unavailable due to server power losses and reboots, network switch failures, etc. After staying unavailable for a certain period of time, these engines may become "excluded" or "errored" in `dmg system query` output. Once the states of all engines stabilize (see [`CRT_EVENT_DELAY`](env_variables.md)), each pool will check whether there is enough redundancy (see [Pool RF](pool_operations.md#pool-redundancy-factor)) to tolerate the unavailability of the "excluded" or "errored" engines. If there is enough redundancy, these engines will be excluded from the pool (shown as "Disabled ranks" in `dmg pool query --health-only` output); otherwise, the pool will perform no exclusion (the engines are shown as "Dead ranks" in `dmg pool query --health-only` output, as described in [Querying a Pool](pool_operations.md#querying-a-pool)) and may become temporarily unavailable (as seen by timeouts of `dmg pool query`, `dmg pool list`, etc.). Similarly, when engines become available again, once the states of all engines stabilize, each pool will perform the aforementioned check for any unavailable engines that remain.

To restore availability as well as capacity and performance, try to start all "excluded" or "errored" engines. Starting all of them at the same time minimizes the chance of triggering rebuild jobs. In many cases, the following command suffices:
```
$ dmg system start
```
If some pools remain unavailable (e.g., `dmg pool list` keeps timing out) after the previous step, restart the whole system:
```
$ dmg system stop --force
$ dmg system start
```
If some engines have been excluded from certain pools, and they are available again, reintegrate them into the pools.

## Diagnostic and Recovery Tools

!!! WARNING : Please be careful and use this tool under the supervision of the DAOS support team.
Expand Down
45 changes: 42 additions & 3 deletions src/client/api/pool.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2015-2023 Intel Corporation.
* (C) Copyright 2015-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -91,15 +91,54 @@ daos_pool_query(daos_handle_t poh, d_rank_list_t **ranks, daos_pool_info_t *info
return -DER_INVAL;
}

if (ranks != NULL && info != NULL &&
(info->pi_bits & (DPI_ENGINES_ENABLED | DPI_ENGINES_DISABLED)) ==
(DPI_ENGINES_ENABLED | DPI_ENGINES_DISABLED)) {
D_ERROR("enabled and disabled not supported in v1 query\n");
return -DER_NOTSUPPORTED;
}

rc = dc_task_create(dc_pool_query, NULL, ev, &task);
if (rc)
return rc;

args = dc_task_get_args(task);
args->poh = poh;
args->ranks = ranks;
args->poh = poh;
args->info = info;
args->prop = pool_prop;
if (info != NULL && (info->pi_bits & DPI_ENGINES_ENABLED) != 0)
args->enabled_ranks = ranks;
else
args->disabled_ranks = ranks;

return dc_task_schedule(task, true);
}

/**
 * Client API entry point for the v2 pool query.
 *
 * Validates the optional property list, creates a dc_pool_query task bound to
 * \a ev, copies the caller's arguments into the task's argument block, and
 * schedules the task. Unlike the v1 call, the enabled and disabled rank lists
 * are passed through as two independent output pointers.
 *
 * Returns -DER_INVAL when \a pool_prop is non-NULL and fails validation, the
 * error from dc_task_create() if task creation fails, and otherwise the
 * result of dc_task_schedule().
 */
int
daos_pool_query_v2(daos_handle_t poh, d_rank_list_t **enabled_ranks, d_rank_list_t **disabled_ranks,
		   daos_pool_info_t *info, daos_prop_t *pool_prop, daos_event_t *ev)
{
	tse_task_t        *query_task;
	daos_pool_query_t *args;
	int                ret;

	DAOS_API_ARG_ASSERT(*args, POOL_QUERY);

	/* A property list is optional, but one that is supplied must be valid. */
	if (pool_prop != NULL) {
		if (!daos_prop_valid(pool_prop, true, false)) {
			D_ERROR("invalid pool_prop parameter.\n");
			return -DER_INVAL;
		}
	}

	ret = dc_task_create(dc_pool_query, NULL, ev, &query_task);
	if (ret != 0)
		return ret;

	/* Hand every caller-supplied argument over to the task unchanged. */
	args                 = dc_task_get_args(query_task);
	args->poh            = poh;
	args->info           = info;
	args->prop           = pool_prop;
	args->enabled_ranks  = enabled_ranks;
	args->disabled_ranks = disabled_ranks;

	return dc_task_schedule(query_task, true);
}
Expand Down
92 changes: 16 additions & 76 deletions src/control/cmd/daos/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,21 +295,21 @@ func convertPoolInfo(pinfo *C.daos_pool_info_t) (*daos.PoolInfo, error) {
return poolInfo, nil
}

func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
var rlPtr **C.d_rank_list_t = nil
var rl *C.d_rank_list_t = nil
func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
poolInfo := &daos.PoolInfo{}

if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) || queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) ||
queryMask.HasOption(daos.PoolQueryOptionDeadEngines) {
rlPtr = &rl
}
var enabledRanks *C.d_rank_list_t
var disabledRanks *C.d_rank_list_t

cPoolInfo := C.daos_pool_info_t{
pi_bits: C.uint64_t(queryMask),
}

rc := C.daos_pool_query(poolHdl, rlPtr, &cPoolInfo, nil, nil)
defer C.d_rank_list_free(rl)
rc := C.daos_pool_query_v2(poolHdl, &enabledRanks, &disabledRanks, &cPoolInfo, nil, nil)
defer func() {
C.d_rank_list_free(enabledRanks)
C.d_rank_list_free(disabledRanks)
}()
if err := daosError(rc); err != nil {
return nil, err
}
Expand All @@ -319,79 +319,19 @@ func queryPoolRankLists(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (
return nil, err
}

if rlPtr != nil {
rs, err := rankSetFromC(rl)
if enabledRanks != nil {
rs, err := rankSetFromC(enabledRanks)
if err != nil {
return nil, err
}
if queryMask.HasOption(daos.PoolQueryOptionEnabledEngines) {
poolInfo.EnabledRanks = rs
}
if queryMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
poolInfo.DisabledRanks = rs
}
if queryMask.HasOption(daos.PoolQueryOptionDeadEngines) {
poolInfo.DeadRanks = rs
}
poolInfo.EnabledRanks = rs
}

return poolInfo, nil
}
func queryPool(poolHdl C.daos_handle_t, queryMask daos.PoolQueryMask) (*daos.PoolInfo, error) {
poolInfo := &daos.PoolInfo{}
originalMask := queryMask // Save the original queryMask

// Function to handle the query and return a single RankList
queryAndUpdate := func(option string) error {
// Clear previous options and set new option
queryMask.ClearAll()
queryMask.SetOptions(option)

poolInfo1, err := queryPoolRankLists(poolHdl, queryMask)
if disabledRanks != nil {
rs, err := rankSetFromC(disabledRanks)
if err != nil {
return err
}

switch option {
case daos.PoolQueryOptionEnabledEngines:
poolInfo.EnabledRanks = poolInfo1.EnabledRanks
case daos.PoolQueryOptionDisabledEngines:
poolInfo.DisabledRanks = poolInfo1.DisabledRanks
case daos.PoolQueryOptionDeadEngines:
poolInfo.DeadRanks = poolInfo1.DeadRanks
}
return nil
}

// Preprocess queryMask, select one option for the first query
var firstOption string
if originalMask.HasOption(daos.PoolQueryOptionEnabledEngines) {
firstOption = daos.PoolQueryOptionEnabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDisabledEngines) {
firstOption = daos.PoolQueryOptionDisabledEngines
} else if originalMask.HasOption(daos.PoolQueryOptionDeadEngines) {
firstOption = daos.PoolQueryOptionDeadEngines
}

// Perform the first query to get basic information
if err := queryAndUpdate(firstOption); err != nil {
return nil, err
}

// Check the original query mask and update fields as needed
queryOptions := []string{
daos.PoolQueryOptionEnabledEngines,
daos.PoolQueryOptionDisabledEngines,
daos.PoolQueryOptionDeadEngines,
}

// Process each option sequentially
for _, opt := range queryOptions {
if originalMask.HasOption(opt) && opt != firstOption {
if err := queryAndUpdate(opt); err != nil {
return nil, err
}
return nil, err
}
poolInfo.DisabledRanks = rs
}

return poolInfo, nil
Expand Down
16 changes: 16 additions & 0 deletions src/control/common/proto/logging.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,22 @@ func Debug(msg proto.Message) string {
for i, b := range m.TierBytes {
fmt.Fprintf(&bld, "%d:%d ", i, b)
}
case *mgmtpb.PoolQueryReq:
fmt.Fprintf(&bld, "%T id:%s qm:%s", m, m.Id, daos.PoolQueryMask(m.QueryMask))
case *mgmtpb.PoolQueryResp:
fmt.Fprintf(&bld, "%T status:%s uuid:%s qm:%s map:%d tot(eng/tgts):%d/%d ver(p/u):%d/%d svc_ldr:%d ",
m, daos.Status(m.Status), m.Uuid, daos.PoolQueryMask(m.QueryMask), m.Version,
m.TotalEngines, m.TotalTargets, m.PoolLayoutVer, m.UpgradeLayoutVer, m.SvcLdr)
ranks := &ranklist.RankSet{}
for _, r := range m.SvcReps {
ranks.Add(ranklist.Rank(r))
}
fmt.Fprintf(&bld, "svc_ranks:%s ", ranks.String())
fmt.Fprintf(&bld, "ena_ranks:%s ", m.EnabledRanks)
fmt.Fprintf(&bld, "dis_ranks:%s ", m.DisabledRanks)
fmt.Fprintf(&bld, "dead_ranks:%s ", m.DeadRanks)
fmt.Fprintf(&bld, "rebuild:%+v ", m.Rebuild)
fmt.Fprintf(&bld, "tier_stats:%+v ", m.TierStats)
case *mgmtpb.PoolEvictReq:
fmt.Fprintf(&bld, "%T pool:%s", m, m.Id)
if len(m.Handles) > 0 {
Expand Down
46 changes: 46 additions & 0 deletions src/include/daos_pool.h
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,52 @@ int
daos_pool_query(daos_handle_t poh, d_rank_list_t **ranks, daos_pool_info_t *info,
daos_prop_t *pool_prop, daos_event_t *ev);

/**
* Query pool information, including multiple rank lists (enabled, disabled). The user
* should provide at least one of \a info or the \a disabled_ranks list as an output buffer.
*
* \param[in] poh Pool connection handle.
* \param[out] enabled_ranks
* Optional, returned enabled pool storage engine ranks. This list will
* be populated if \a info is not NULL and its pi_bits has
* DPI_ENGINES_ENABLED set.
* The caller is responsible for freeing the list with d_rank_list_free().
* \param[out] disabled_ranks
* Optional, returned disabled pool storage engine ranks.
* If \a info is NULL, this list will be populated with the ranks of
* all engines with any targets disabled. If \a info is not NULL, this
* list will be populated if its pi_bits has DPI_ENGINES_DISABLED set.
* The caller is responsible for freeing the list with d_rank_list_free().
* \param[in,out]
* info Optional, returned pool information,
* see daos_pool_info_bit.
* \param[out] pool_prop
* Optional, returned pool properties.
* If it is NULL, the properties are not queried.
* If pool_prop is non-NULL but its dpp_entries is NULL,
* all pool properties are queried; DAOS internally
* allocates the needed buffers and assigns the pointer to
* dpp_entries.
* If pool_prop's dpp_nr > 0 and dpp_entries is non-NULL,
* only the properties for the specified dpe_type(s) are
* queried; DAOS internally allocates the needed buffer for
* dpe_str or dpe_val_ptr, and if the dpe_type has an
* immediate value, it is assigned directly to dpe_val.
* The user can free the associated buffer by calling
* daos_prop_free().
* \param[in] ev Completion event, it is optional and can be NULL.
* The function will run in blocking mode if \a ev is NULL.
*
* \return These values will be returned by \a ev::ev_error in
* non-blocking mode:
* 0 Success
* -DER_INVAL Invalid parameter
* -DER_UNREACH Network is unreachable
* -DER_NO_HDL Invalid pool handle
*/
int
daos_pool_query_v2(daos_handle_t poh, d_rank_list_t **enabled_ranks, d_rank_list_t **disabled_ranks,
daos_pool_info_t *info, daos_prop_t *pool_prop, daos_event_t *ev);

/**
* Query information of storage targets within a DAOS pool.
*
Expand Down
6 changes: 4 additions & 2 deletions src/include/daos_task.h
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,10 @@ typedef struct {
typedef struct {
/** Pool open handle. */
daos_handle_t poh;
/** Optional, returned storage ranks in this pool. */
d_rank_list_t **ranks;
/** Optional, returned enabled storage ranks in this pool. */
d_rank_list_t **enabled_ranks;
/** Optional, returned disabled storage ranks in this pool. */
d_rank_list_t **disabled_ranks;
/** Optional, returned pool information. */
daos_pool_info_t *info;
/** Optional, returned pool properties. */
Expand Down
Loading
Loading