Skip to content

Commit

Permalink
ADLR/megatron-lm!2467 - ci: Add memory consumption to tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
ko3n1g committed Jan 3, 2025
1 parent 076972e commit 9238a5e
Show file tree
Hide file tree
Showing 356 changed files with 1,213 additions and 4,284 deletions.
3 changes: 3 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ stages:

# Pipeline-wide job defaults; individual jobs may override any of these keys.
default:
  # Allow a running job to be cancelled when a newer pipeline makes it redundant.
  interruptible: true
  # Automatically re-run a job up to two more times, but only when the failure
  # was caused by the runner itself (not by the job's own script failing).
  retry:
    max: 2
    when: runner_system_failure

variables:
UNIT_TEST:
Expand Down
2 changes: 1 addition & 1 deletion .gitlab/stages/00.pre.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ pre:maybe_cherry_pick_commit:
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":alert: Cherrypick bot 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed\ncc '$SLACK_ADMIN'"
"text": "beep boop 🤖: Cherry-pick of <'$URL'|!'$MR_ID'> failed\ncc '$SLACK_ADMIN'"
}
}
]
Expand Down
69 changes: 27 additions & 42 deletions .gitlab/stages/01.test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -103,67 +103,53 @@ test:unit_tests_configure:
- |
A100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_A100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_A100 || echo $DEFAULT_A100_CLUSTER)
H100_CLUSTER=$([[ "$FUNCTIONAL_TEST_CLUSTER_H100" != "" ]] && echo $FUNCTIONAL_TEST_CLUSTER_H100 || echo $DEFAULT_H100_CLUSTER)
- |
ARGS=(
"--scope unit-tests"
"--n-repeat ${UNIT_TEST_REPEAT}"
"--time-limit $(( UNIT_TEST_TIMEOUT * 60 ))"
"--test-cases all"
"--a100-cluster dgxa100_dracooci-ord"
"--h100-cluster dgxh100_coreweave"
"--h100-partition batch_short,batch"
"--container-image ${UTILITY_IMAGE}"
"--container-tag ${CI_PIPELINE_ID}"
"--dependent-job test:unit_tests_configure"
)
- |
export PYTHONPATH=$(pwd)
python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
--scope "unit-tests" \
--environment lts \
--n-repeat "${UNIT_TEST_REPEAT}" \
--time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
--test-cases "all" \
--a100-cluster "dgxa100_dracooci-ord" \
--h100-cluster "dgxh100_coreweave" \
--container-image ${UTILITY_IMAGE} \
--container-tag ${CI_PIPELINE_ID} \
--dependent-job "test:unit_tests_configure" \
${ARGS[@]} \
--environment "lts" \
--tag "legacy" \
--output-path "unit-test-job-lts-legacy.yaml"
- |
export PYTHONPATH=$(pwd)
python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
--scope "unit-tests" \
--environment lts \
--n-repeat "${UNIT_TEST_REPEAT}" \
--time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
--test-cases "all" \
--a100-cluster "dgxa100_dracooci-ord" \
--h100-cluster "dgxh100_coreweave" \
--container-image ${UTILITY_IMAGE} \
--container-tag ${CI_PIPELINE_ID} \
--dependent-job "test:unit_tests_configure" \
${ARGS[@]} \
--environment "lts" \
--tag "latest" \
--output-path "unit-test-job-lts-latest.yaml"
- |
export PYTHONPATH=$(pwd)
python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
--scope "unit-tests" \
--environment dev \
--n-repeat "${UNIT_TEST_REPEAT}" \
--time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
--test-cases "all" \
--a100-cluster "dgxa100_dracooci-ord" \
--h100-cluster "dgxh100_coreweave" \
--container-image ${UTILITY_IMAGE} \
--container-tag ${CI_PIPELINE_ID} \
--dependent-job "test:unit_tests_configure" \
${ARGS[@]} \
--environment "dev" \
--tag "legacy" \
--output-path "unit-test-job-dev-legacy.yaml"
- |
export PYTHONPATH=$(pwd)
python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
--scope "unit-tests" \
--environment dev \
--n-repeat "${UNIT_TEST_REPEAT}" \
--time-limit "$(( UNIT_TEST_TIMEOUT * 60 ))" \
--test-cases "all" \
--a100-cluster "dgxa100_dracooci-ord" \
--h100-cluster "dgxh100_coreweave" \
--container-image ${UTILITY_IMAGE} \
--container-tag ${CI_PIPELINE_ID} \
--dependent-job "test:unit_tests_configure" \
${ARGS[@]} \
--environment "dev" \
--tag "latest" \
--output-path "unit-test-job-dev-latest.yaml"
rules:
- if: $UNIT_TEST == 'yes' && $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
allow_failure: true
when: on_success
- if: $UNIT_TEST == 'yes' && $UNIT_TEST_REPEAT != '0'
when: on_success
artifacts:
paths:
- unit-test-job-dev-legacy.yaml
Expand Down Expand Up @@ -482,4 +468,3 @@ test:notify_release:
else
eval "$CMD"
fi
51 changes: 33 additions & 18 deletions .gitlab/stages/02.functional-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,34 +47,30 @@ functional:configure:
else
RELEASE_ARGS=()
fi
- |
ARGS=(
"--scope $FUNCTIONAL_TEST_SCOPE"
"--n-repeat $FUNCTIONAL_TEST_REPEAT"
"--time-limit $FUNCTIONAL_TEST_TIME_LIMIT"
"--test-cases $FUNCTIONAL_TEST_CASES"
"--a100-cluster $A100_CLUSTER"
"--h100-cluster $H100_CLUSTER"
"--container-image ${UTILITY_IMAGE}"
"--container-tag ${CI_PIPELINE_ID}"
"--dependent-job functional:configure"
)
- |
export PYTHONPATH=$(pwd)
python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
--scope $FUNCTIONAL_TEST_SCOPE \
${ARGS[@]} \
--environment dev \
--n-repeat "$FUNCTIONAL_TEST_REPEAT" \
--time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \
--test-cases $FUNCTIONAL_TEST_CASES \
--a100-cluster $A100_CLUSTER \
--h100-cluster $H100_CLUSTER \
--container-image ${UTILITY_IMAGE} \
--container-tag ${CI_PIPELINE_ID} \
--dependent-job "functional:configure" \
--output-path "functional-test-job-dev.yaml" \
${RELEASE_ARGS[@]}
- |
export PYTHONPATH=$(pwd)
python tests/test_utils/python_scripts/generate_jet_trigger_job.py \
--scope $FUNCTIONAL_TEST_SCOPE \
${ARGS[@]} \
--environment lts \
--n-repeat "$FUNCTIONAL_TEST_REPEAT" \
--time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \
--test-cases $FUNCTIONAL_TEST_CASES \
--a100-cluster $A100_CLUSTER \
--h100-cluster $H100_CLUSTER \
--container-image ${UTILITY_IMAGE} \
--container-tag ${CI_PIPELINE_ID} \
--dependent-job "functional:configure" \
--output-path "functional-test-job-lts.yaml" \
${RELEASE_ARGS[@]}
artifacts:
Expand Down Expand Up @@ -141,3 +137,22 @@ functional:notify:
- if: $CI_PIPELINE_SOURCE == "schedule" && $FUNCTIONAL_TEST == "yes"
when: always
- when: never

# Manually triggered helper job: runs download_golden_values.py for the current
# pipeline and publishes the resulting tests/ directory as a job artifact.
# (Presumably the script fetches golden values produced by this pipeline's
# functional tests - confirm against the script itself.)
functional:download_golden_values:
  extends: [.functional_tests_rules]
  image: ${UTILITY_IMAGE}:${CI_PIPELINE_ID}
  tags:
    - mcore-docker-node-small
  script:
    # NOTE(review): `env` prints every environment variable to the job log.
    # Confirm that all secrets available to this job are masked CI variables.
    - env
    # Expose the project access token under the name RO_API_TOKEN.
    - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
    # Mark GITLAB_ENDPOINT for export to child processes; its value is expected
    # to be provided by the CI environment (it is not assigned here).
    - export GITLAB_ENDPOINT
    - python tests/test_utils/python_scripts/download_golden_values.py --pipeline-id ${CI_PIPELINE_ID}
  artifacts:
    paths:
      - tests/
  # These job-level rules are declared in addition to `extends`; per GitLab's
  # merge behaviour for `extends`, they take precedence over inherited `rules`.
  rules:
    # Offer the job (manual start) only when functional tests are enabled;
    # allow_failure keeps an un-triggered manual job from blocking the pipeline.
    - if: $FUNCTIONAL_TEST == "yes"
      when: manual
      allow_failure: true
    # In every other case the job is not added to the pipeline.
    - when: never
Loading

0 comments on commit 9238a5e

Please sign in to comment.