Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into tanabarr/control-re…
Browse files Browse the repository at this point in the history
…intpools-pernode

Features: control
Signed-off-by: Tom Nabarro <tom.nabarro@intel.com>
  • Loading branch information
tanabarr committed Dec 27, 2024
2 parents 7fb508b + 4f16b7e commit 10f42ec
Show file tree
Hide file tree
Showing 238 changed files with 5,452 additions and 1,997 deletions.
8 changes: 5 additions & 3 deletions .github/workflows/bullseye-coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ env:
# the organizational defaults values for these variables
# TODO: we really need to define a list of supported versions (ideally it's no more than 2)
# build is done on the lowest version and test on the highest with a "sanity test"
# stage done on all versions in the list ecept the highest
# stage done on all versions in the list except the highest
EL8_BUILD_VERSION: ${{ vars.EL8_BUILD_VERSION_MASTER }}
EL8_VERSION: ${{ vars.EL8_VERSION_MASTER }}
EL9_BUILD_VERSION: ${{ vars.EL9_BUILD_VERSION_MASTER }}
Expand Down Expand Up @@ -365,7 +365,8 @@ jobs:
- name: Publish test results
if: (!cancelled()) && (success() || failure()) &&
steps.run-test.outcome != 'skipped'
uses: EnricoMi/publish-unit-test-result-action@v2
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
check_name: ${{ env.STAGE_NAME }} Test Results
github_token: ${{ secrets.GITHUB_TOKEN }}
Expand Down Expand Up @@ -632,7 +633,8 @@ jobs:
- name: Publish test results
if: (!cancelled()) && (success() || failure()) &&
steps.run-test.outcome != 'skipped'
uses: EnricoMi/publish-unit-test-result-action@v2
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
check_name: ${{ env.STAGE_NAME }} Test Results
github_token: ${{ secrets.GITHUB_TOKEN }}
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/ci2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ jobs:
run: docker cp build-post:/home/daos/daos/nlt-junit.xml ./
- name: Publish NLT test results
if: always()
uses: EnricoMi/publish-unit-test-result-action@v1.17
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
files: nlt-junit.xml
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/landing-builds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ jobs:
run: docker cp build-post:/home/daos/daos/nlt-junit.xml ./
- name: Publish NLT test results
if: always()
uses: EnricoMi/publish-unit-test-result-action@v1.17
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
files: nlt-junit.xml
Expand Down
12 changes: 9 additions & 3 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,18 @@ jobs:
name: Python isort
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Checkout code
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
with:
ref: ${{ github.event.pull_request.head.sha }}
- uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
- name: Set up Python environment
uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0
with:
python-version: '3'
- uses: isort/isort-action@f14e57e1d457956c45a19c05a89cccdf087846e5 # v1.1.0
- name: Install extra python packages
run: python3 -m pip install --requirement utils/cq/requirements.txt
- name: Run isort
uses: isort/isort-action@24d8a7a51d33ca7f36c3f23598dafa33f7071326 # v1.1.1
with:
requirementsFiles: "requirements.txt"
- name: Run on SConstruct file.
Expand Down Expand Up @@ -225,6 +230,7 @@ jobs:
- codespell
# - clang-format # not required
- yaml-lint
- copyright
if: (!cancelled())
steps:
- name: Check if any job failed
Expand Down
7 changes: 4 additions & 3 deletions .github/workflows/pr-metadata.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ jobs:
id: jira-data
- name: Comment on PR
if: always()
uses: thollander/actions-comment-pull-request@v2
# yamllint disable-line rule:line-length
uses: thollander/actions-comment-pull-request@24bffb9b452ba05a4f3f77933840a6a841d1b32b # v3.0.1
with:
comment_tag: 'jira_query_message'
comment-tag: 'jira_query_message'
message: ${{ steps.jira-data.outputs.message }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Set labels
if: ${{ always() && steps.jira-data.outputs.label != '' }}
uses: actions-ecosystem/action-add-labels@v1
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/rpm-build-and-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ env:
# the organizational defaults values for these variables
# TODO: we really need to define a list of supported versions (ideally it's no more than 2)
# build is done on the lowest version and test on the highest with a "sanity test"
# stage done on all versions in the list ecept the highest
# stage done on all versions in the list except the highest
EL8_BUILD_VERSION: ${{ vars.EL8_BUILD_VERSION_MASTER }}
EL8_VERSION: ${{ vars.EL8_VERSION_MASTER }}
EL9_BUILD_VERSION: ${{ vars.EL9_BUILD_VERSION_MASTER }}
Expand Down Expand Up @@ -373,7 +373,8 @@ jobs:
- name: Publish test results
if: (!cancelled()) && (success() || failure()) &&
steps.run-test.outcome != 'skipped'
uses: EnricoMi/publish-unit-test-result-action@v2
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
check_name: ${{ env.STAGE_NAME }} Test Results
github_token: ${{ secrets.GITHUB_TOKEN }}
Expand Down Expand Up @@ -640,7 +641,8 @@ jobs:
- name: Publish test results
if: (!cancelled()) && (success() || failure()) &&
steps.run-test.outcome != 'skipped'
uses: EnricoMi/publish-unit-test-result-action@v2
# yamllint disable-line rule:line-length
uses: EnricoMi/publish-unit-test-result-action@4e7013f9576bd22ffdae979dc6e68cb9ec2aeece # v2.7.0
with:
check_name: ${{ env.STAGE_NAME }} Test Results
github_token: ${{ secrets.GITHUB_TOKEN }}
Expand Down
35 changes: 8 additions & 27 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -364,25 +364,6 @@ pipeline {
stage('Check PR') {
when { changeRequest() }
parallel {
stage('Used Required Git Hooks') {
steps {
catchError(stageResult: 'UNSTABLE', buildResult: 'SUCCESS',
message: 'PR did not get committed with required git hooks. ' +
'Please see utils/githooks/README.md.') {
sh 'if ! ' + cachedCommitPragma('Required-githooks', 'false') + '''; then
echo 'PR did not get committed with required git hooks. ' +
'Please see utils/githooks/README.md.'
exit 1
fi'''
}
}
post {
unsuccessful {
echo 'PR did not get committed with required git hooks. ' +
'Please see utils/githooks/README.md.'
}
}
} // stage('Used Required Git Hooks')
stage('Branch name check') {
when { changeRequest() }
steps {
Expand Down Expand Up @@ -965,7 +946,7 @@ pipeline {
}
}
} // stage('Functional on EL 9')
stage('Functional on Leap 15.5') {
stage('Functional on Leap 15.6') {
when {
beforeAgent true
expression { !skipStage() }
Expand All @@ -986,7 +967,7 @@ pipeline {
job_status_update()
}
} // post
} // stage('Functional on Leap 15.5')
} // stage('Functional on Leap 15.6')
stage('Functional on Ubuntu 20.04') {
when {
beforeAgent true
Expand Down Expand Up @@ -1061,7 +1042,7 @@ pipeline {
job_status_update()
}
}
} // stage('Fault inection testing on EL 8.8')
} // stage('Fault injection testing on EL 8.8')
stage('Test RPMs on EL 8.6') {
when {
beforeAgent true
Expand All @@ -1081,8 +1062,8 @@ pipeline {
rpm_test_post(env.STAGE_NAME, env.NODELIST)
}
}
} // stage('Test CentOS 7 RPMs')
stage('Test RPMs on Leap 15.4') {
} // stage('Test RPMs on EL 8.6')
stage('Test RPMs on Leap 15.5') {
when {
beforeAgent true
expression { ! skipStage() }
Expand All @@ -1096,8 +1077,8 @@ pipeline {
* additionally for this use-case, can't override
ftest_arg with this :-(
script {
'Test RPMs on Leap 15.4': getFunctionalTestStage(
name: 'Test RPMs on Leap 15.4',
'Test RPMs on Leap 15.5': getFunctionalTestStage(
name: 'Test RPMs on Leap 15.5',
pragma_suffix: '',
label: params.CI_UNIT_VM1_LABEL,
next_version: next_version,
Expand Down Expand Up @@ -1133,7 +1114,7 @@ pipeline {
rpm_test_post(env.STAGE_NAME, env.NODELIST)
}
}
} // stage('Test Leap 15 RPMs')
} // stage('Test RPMs on Leap 15.5')
} // parallel
} // stage('Test')
stage('Test Storage Prep on EL 8.8') {
Expand Down
4 changes: 4 additions & 0 deletions ci/parse_ci_envs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ if [ -n "${STAGE_NAME:?}" ]; then
: "${TARGET:=centos9}"
: "${REPO_SPEC:=el-9}"
;;
*Leap\ 15.6*|*leap15.6*|*opensuse15.6*|*sles15.6*)
: "${CHROOT_NAME:=opensuse-leap-15.5-x86_64}"
: "${TARGET:=leap15.6}"
;;
*Leap\ 15.5*|*leap15.5*|*opensuse15.5*|*sles15.5*)
: "${CHROOT_NAME:=opensuse-leap-15.5-x86_64}"
: "${TARGET:=leap15.5}"
Expand Down
2 changes: 2 additions & 0 deletions ci/test_files_to_stash.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ build/*/*/src/common/tests/umem_test,
build/*/*/src/common/tests/umem_test_bmem,
build/*/*/src/bio/smd/tests/smd_ut,
build/*/*/src/tests/rpc/rpc_tests,
build/*/*/src/engine/tests/abt_perf,
build/*/*/src/engine/tests/abt_stack,
src/common/tests/btree.sh,
src/control/run_go_tests.sh,
src/rdb/raft_tests/raft_tests.py,
Expand Down
2 changes: 1 addition & 1 deletion docs/admin/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,7 @@ Alternately, the administrator may erase and re-format the DAOS system to start

### Engines become unavailable

Engines may become unavailable due to server power losses and reboots, network switch failures, etc. After staying unavailable for a certain period of time, these engines may become "excluded" or "errored" in `dmg system query` output. Once the states of all engines stabilize (see [`CRT_EVENT_DELAY`](env_variables.md)), each pool will check whether there is enough redundancy (see [Pool RF](pool_operations.md#pool-redundancy-factor)) to tolerate the unavailability of the "excluded" or "errored" engines. If there is enough redundancy, these engines will be excluded from the pool ("disabled ranks" in `dmg pool query --health-only` output); otherwise, the pool will perform no exclusion ("suspect ranks" in `dmg pool query --health-only` output as described in [Querying a Pool](pool_operations.md#querying-a-pool)) and may become temporarily unavailable (as seen by timeouts of `dmg pool query`, `dmg pool list`, etc.). Similarly, when engines become available, whenever the states of all engines stabilize, each pool will perform the aforementioned check for any unavailable engines that remain.
Engines may become unavailable due to server power losses and reboots, network switch failures, etc. After staying unavailable for a certain period of time, these engines may become "excluded" or "errored" in `dmg system query` output. Once the states of all engines stabilize (see [`CRT_EVENT_DELAY`](env_variables.md)), each pool will check whether there is enough redundancy (see [Pool RF](pool_operations.md#pool-redundancy-factor)) to tolerate the unavailability of the "excluded" or "errored" engines. If there is enough redundancy, these engines will be excluded from the pool ("Disabled ranks" in `dmg pool query --health-only` output); otherwise, the pool will perform no exclusion ("Dead ranks" in `dmg pool query --health-only` output as described in [Querying a Pool](pool_operations.md#querying-a-pool)) and may become temporarily unavailable (as seen by timeouts of `dmg pool query`, `dmg pool list`, etc.). Similarly, when engines become available, whenever the states of all engines stabilize, each pool will perform the aforementioned check for any unavailable engines that remain.

To restore availability as well as capacity and performance, try to start all "excluded" or "errored" engines. Starting all of them at the same time minimizes the chance of triggering rebuild jobs. In many cases, the following command suffices:
```
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ line_length = 99
skip_gitignore = true

[tool.codespell]
skip = './src/control/vendor,./src/control/go.sum,./.git,./src/rdb/raft,./build,./install,./venv,./src/control/vendor/,./src/control/security/testdata/certs/source.txt'
skip = './src/control/vendor/*,./src/control/go.sum,./.git/*,./src/rdb/raft/*,./build/*,./install/*,./venv/*,./src/control/security/testdata/certs/source.txt'
quiet-level = 3
ignore-words = 'ci/codespell.ignores'
builtin = 'clear,rare,informal,names,en-GB_to_en-US'
1 change: 0 additions & 1 deletion site_scons/prereq_tools/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,6 @@ def __init__(self, env, opts):
opts.Add('USE_INSTALLED', 'Comma separated list of preinstalled dependencies', 'none')
opts.Add(('MPI_PKG', 'Specifies name of pkg-config to load for MPI', None))
opts.Add(BoolVariable('FIRMWARE_MGMT', 'Build in device firmware management.', False))
opts.Add(BoolVariable('STACK_MMAP', 'Allocate ABT ULTs stacks with mmap()', False))
opts.Add(BoolVariable('STATIC_FUSE', "Build with static libfuse library", False))
opts.Add(EnumVariable('BUILD_TYPE', "Set the build type", 'release',
['dev', 'debug', 'release'], ignorecase=1))
Expand Down
5 changes: 4 additions & 1 deletion src/bio/bio_buffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@ dma_buffer_create(unsigned int init_cnt, int tgt_id)

rc = dma_buffer_grow(buf, init_cnt);
if (rc != 0) {
D_ERROR("Failed to grow DMA buffer (%u chunks)\n", buf->bdb_tot_cnt);
dma_buffer_destroy(buf);
return NULL;
}
Expand Down Expand Up @@ -867,8 +868,10 @@ dma_map_one(struct bio_desc *biod, struct bio_iov *biov, void *arg)
*/
if (pg_cnt > bio_chk_sz) {
chk = dma_alloc_chunk(pg_cnt);
if (chk == NULL)
if (chk == NULL) {
D_ERROR("Failed to allocate %u pages DMA buffer\n", pg_cnt);
return -DER_NOMEM;
}

chk->bdc_type = biod->bd_chk_type;
rc = iod_add_chunk(biod, chk);
Expand Down
31 changes: 22 additions & 9 deletions src/bio/bio_xstream.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@
/* SPDK blob parameters */
#define DAOS_BS_CLUSTER_SZ (1ULL << 25) /* 32MB */
/* DMA buffer parameters */
#define DAOS_DMA_CHUNK_CNT_INIT 24 /* Per-xstream init chunks, 192MB */
#define DAOS_DMA_CHUNK_CNT_MAX 128 /* Per-xstream max chunks, 1GB */
#define DAOS_DMA_CHUNK_INIT_PCT	60	/* Default per-xstream init chunks, in percentage */
#define DAOS_DMA_CHUNK_CNT_MAX 128 /* Default per-xstream max chunks, 1GB */
#define DAOS_DMA_CHUNK_CNT_MIN 32 /* Per-xstream min chunks, 256MB */

/* Max in-flight blob IOs per io channel */
Expand All @@ -48,8 +48,8 @@ unsigned int bio_chk_sz;
unsigned int bio_chk_cnt_max;
/* NUMA node affinity */
unsigned int bio_numa_node;
/* Per-xstream initial DMA buffer size (in chunk count) */
static unsigned int bio_chk_cnt_init;
/* Per-xstream initial DMA buffer size (in percentage) */
static unsigned int bio_chk_init_pct;
/* Direct RDMA over SCM */
bool bio_scm_rdma;
/* Whether SPDK inited */
Expand Down Expand Up @@ -203,6 +203,14 @@ bypass_health_collect()
return nvme_glb.bd_bypass_health_collect;
}

static inline unsigned int
init_chk_cnt()
{
unsigned init_cnt = (bio_chk_cnt_max * bio_chk_init_pct / 100);

return (init_cnt == 0) ? 1 : init_cnt;
}

int
bio_nvme_init(const char *nvme_conf, int numa_node, unsigned int mem_size,
unsigned int hugepage_size, unsigned int tgt_nr, bool bypass_health_collect)
Expand Down Expand Up @@ -249,7 +257,7 @@ bio_nvme_init(const char *nvme_conf, int numa_node, unsigned int mem_size,
*/
glb_criteria.fc_max_csum_errs = UINT32_MAX;

bio_chk_cnt_init = DAOS_DMA_CHUNK_CNT_INIT;
bio_chk_init_pct = DAOS_DMA_CHUNK_INIT_PCT;
bio_chk_cnt_max = DAOS_DMA_CHUNK_CNT_MAX;
bio_chk_sz = ((uint64_t)size_mb << 20) >> BIO_DMA_PAGE_SHIFT;

Expand Down Expand Up @@ -291,8 +299,13 @@ bio_nvme_init(const char *nvme_conf, int numa_node, unsigned int mem_size,
mem_size, tgt_nr);
return -DER_INVAL;
}
D_INFO("Set per-xstream DMA buffer upper bound to %u %uMB chunks\n",
bio_chk_cnt_max, size_mb);

d_getenv_uint("DAOS_DMA_INIT_PCT", &bio_chk_init_pct);
if (bio_chk_init_pct == 0 || bio_chk_init_pct >= 100)
bio_chk_init_pct = DAOS_DMA_CHUNK_INIT_PCT;

D_INFO("Set per-xstream DMA buffer upper bound to %u %uMB chunks, prealloc %u chunks\n",
bio_chk_cnt_max, size_mb, init_chk_cnt());

spdk_bs_opts_init(&nvme_glb.bd_bs_opts, sizeof(nvme_glb.bd_bs_opts));
nvme_glb.bd_bs_opts.cluster_sz = DAOS_BS_CLUSTER_SZ;
Expand Down Expand Up @@ -1560,7 +1573,7 @@ bio_xsctxt_alloc(struct bio_xs_context **pctxt, int tgt_id, bool self_polling)

/* Skip NVMe context setup if the daos_nvme.conf isn't present */
if (!bio_nvme_configured(SMD_DEV_TYPE_MAX)) {
ctxt->bxc_dma_buf = dma_buffer_create(bio_chk_cnt_init, tgt_id);
ctxt->bxc_dma_buf = dma_buffer_create(init_chk_cnt(), tgt_id);
if (ctxt->bxc_dma_buf == NULL) {
D_FREE(ctxt);
*pctxt = NULL;
Expand Down Expand Up @@ -1673,7 +1686,7 @@ bio_xsctxt_alloc(struct bio_xs_context **pctxt, int tgt_id, bool self_polling)
D_ASSERT(d_bdev != NULL);
}

ctxt->bxc_dma_buf = dma_buffer_create(bio_chk_cnt_init, tgt_id);
ctxt->bxc_dma_buf = dma_buffer_create(init_chk_cnt(), tgt_id);
if (ctxt->bxc_dma_buf == NULL) {
D_ERROR("failed to initialize dma buffer\n");
rc = -DER_NOMEM;
Expand Down
2 changes: 1 addition & 1 deletion src/client/dfs/SConscript
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def scons():
libraries = ['daos_common', 'daos', 'uuid', 'gurt']

dfs_src = ['common.c', 'cont.c', 'dir.c', 'file.c', 'io.c', 'lookup.c', 'mnt.c', 'obj.c',
'pipeline.c', 'readdir.c', 'rename.c', 'xattr.c', 'dfs_sys.c']
'pipeline.c', 'readdir.c', 'rename.c', 'xattr.c', 'dfs_sys.c', 'metrics.c']
dfs = denv.d_library('dfs', dfs_src, LIBS=libraries)
denv.Install('$PREFIX/lib64/', dfs)

Expand Down
4 changes: 4 additions & 0 deletions src/client/dfs/common.c
Original file line number Diff line number Diff line change
Expand Up @@ -625,6 +625,8 @@ entry_stat(dfs_t *dfs, daos_handle_t th, daos_handle_t oh, const char *name, siz
stbuf->st_atim.tv_sec = stbuf->st_mtim.tv_sec;
stbuf->st_atim.tv_nsec = stbuf->st_mtim.tv_nsec;
}

DFS_OP_STAT_INCR(dfs, DOS_STAT);
return 0;
}

Expand Down Expand Up @@ -710,6 +712,7 @@ open_dir(dfs_t *dfs, dfs_obj_t *parent, int flags, daos_oclass_id_t cid, struct
D_ASSERT(rc == 0);
dir->d.chunk_size = entry->chunk_size;
dir->d.oclass = entry->oclass;
DFS_OP_STAT_INCR(dfs, DOS_MKDIR);
return 0;
}
}
Expand Down Expand Up @@ -742,6 +745,7 @@ open_dir(dfs_t *dfs, dfs_obj_t *parent, int flags, daos_oclass_id_t cid, struct
oid_cp(&dir->oid, entry->oid);
dir->d.chunk_size = entry->chunk_size;
dir->d.oclass = entry->oclass;
DFS_OP_STAT_INCR(dfs, DOS_OPENDIR);
return 0;
}

Expand Down
Loading

0 comments on commit 10f42ec

Please sign in to comment.