Skip to content

Commit

Permalink
Fixed bug in end-to-end-session-based that was failing test on CI whe…
Browse files Browse the repository at this point in the history
…n only a single GPU was available (which prevented multigpu training)
  • Loading branch information
gabrielspmoreira committed Nov 3, 2023
1 parent 382c0a3 commit ffff554
Showing 1 changed file with 15 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,19 @@
"- <b>per device batch size for evaluation</b>: see above"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "c9e83d47-380c-4118-bc29-8bc108163fa0",
"metadata": {},
"outputs": [],
"source": [
"# Launch one process per available GPU (at most 2); if only 1 GPU is available, start a single process to use that GPU\n",
"from torch.cuda import device_count\n",
"num_gpus = device_count()\n",
"NUM_PROCESSES = min(num_gpus, 2)"
]
},
{
"cell_type": "code",
"execution_count": 4,
Expand Down Expand Up @@ -502,7 +515,7 @@
"LR = float(os.environ.get(\"LEARNING_RATE\", \"0.0005\"))\n",
"BATCH_SIZE_TRAIN = int(os.environ.get(\"BATCH_SIZE_TRAIN\", \"256\"))\n",
"BATCH_SIZE_VALID = int(os.environ.get(\"BATCH_SIZE_VALID\", \"128\"))\n",
"!python -m torch.distributed.run --nproc_per_node 2 {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}"
"!python -m torch.distributed.run --nproc_per_node {NUM_PROCESSES} {TRAINER_FILE} --path {OUTPUT_DIR} --learning-rate {LR} --per-device-train-batch-size {BATCH_SIZE_TRAIN} --per-device-eval-batch-size {BATCH_SIZE_VALID}"
]
},
{
Expand Down Expand Up @@ -554,7 +567,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.12"
}
},
"nbformat": 4,
Expand Down

0 comments on commit ffff554

Please sign in to comment.