Skip to content

Commit

Permalink
Merge branch 'master' into more-gpu-test-fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
vfdev-5 authored Dec 17, 2024
2 parents 61a2c29 + 06fe8bd commit 064cfda
Show file tree
Hide file tree
Showing 21 changed files with 278 additions and 136 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/gpu-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ concurrency:
group: gpu-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
cancel-in-progress: true

# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job_v2.yml

jobs:
gpu-tests:
Expand All @@ -25,7 +25,7 @@ jobs:
pytorch-channel: [pytorch, pytorch-nightly]
fail-fast: false
env:
DOCKER_IMAGE: "pytorch/conda-builder:cuda12.1"
DOCKER_IMAGE: "pytorch/almalinux-builder:cuda12.4"
REPOSITORY: ${{ github.repository }}
PR_NUMBER: ${{ github.event.pull_request.number }}
runs-on: linux.8xlarge.nvidia.gpu
Expand All @@ -40,7 +40,7 @@ jobs:
echo "::endgroup::"
- name: Checkout repository (pytorch/test-infra)
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
# Support the use case where we need to checkout someone's fork
repository: pytorch/test-infra
Expand All @@ -55,7 +55,7 @@ jobs:
docker-image: ${{ env.DOCKER_IMAGE }}

- name: Checkout repository (${{ github.repository }})
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
# Support the use case where we need to checkout someone's fork
repository: ${{ github.repository }}
Expand Down Expand Up @@ -102,9 +102,9 @@ jobs:
# Install PyTorch
if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu121
pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu124
else
pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121
pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
fi
python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
Expand All @@ -124,7 +124,7 @@ jobs:
uses: nick-fields/retry@v2.9.0
with:
max_attempts: 5
timeout_minutes: 25
timeout_minutes: 45
shell: bash
command: docker exec -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2'
new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2'
Expand All @@ -139,7 +139,7 @@ jobs:
- name: Run examples in container
continue-on-error: false
run: |
SCRIPT=$(cat << EOF
script=$(cat << EOF
set -xe
Expand Down
23 changes: 12 additions & 11 deletions .github/workflows/pytorch-version-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,25 @@ jobs:
max-parallel: 5
fail-fast: false
matrix:
# Here we keep Python 3.8 tests until the end of 2024, after which we
# will drop that Python version and the related PyTorch versions
python-version: [3.8, 3.9, "3.10"]
python-version: [3.9, "3.10", "3.11"]
pytorch-version:
[2.4.1, 2.3.1, 2.2.2, 2.0.1, 1.13.1, 1.12.1, 1.10.0, 1.8.1]
[2.4.1, 2.3.1, 2.2.2, 2.0.1, 1.13.1, 1.12.1, 1.10.0]
exclude:
# Disabling Python 3.9 support with PyTorch 1.7.1 and 1.8.1 to stop repeated pytorch-version test failures.
# https://github.com/pytorch/ignite/issues/2383
- pytorch-version: 1.8.1
python-version: 3.9
- pytorch-version: 1.8.1
python-version: "3.10"

- pytorch-version: 1.10.0
python-version: "3.10"
- pytorch-version: 1.10.0
python-version: "3.11"

- pytorch-version: 1.11.0
python-version: "3.10"
- pytorch-version: 1.11.0
python-version: "3.11"
- pytorch-version: 1.12.1
python-version: "3.11"
# Conda fails to install the cpuonly version, and a few CPU distributed tests
# are failing with unrelated errors
- pytorch-version: 1.13.1
python-version: "3.11"

steps:
- uses: actions/checkout@v4
Expand Down
7 changes: 4 additions & 3 deletions examples/cifar10/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import torch.nn as nn
import torch.optim as optim
import utils
from torch.cuda.amp import autocast, GradScaler
from torch.amp import autocast
from torch.cuda.amp import GradScaler

import ignite
import ignite.distributed as idist
Expand Down Expand Up @@ -299,7 +300,7 @@ def train_step(engine, batch):

model.train()

with autocast(enabled=with_amp):
with autocast("cuda", enabled=with_amp):
y_pred = model(x)
loss = criterion(y_pred, y)

Expand Down Expand Up @@ -355,7 +356,7 @@ def evaluate_step(engine: Engine, batch):
x = x.to(device, non_blocking=True)
y = y.to(device, non_blocking=True)

with autocast(enabled=with_amp):
with autocast("cuda", enabled=with_amp):
output = model(x)
return output, y

Expand Down
5 changes: 3 additions & 2 deletions examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import fire
import torch
from torch.cuda.amp import autocast, GradScaler
from torch.amp import autocast
from torch.cuda.amp import GradScaler
from torch.nn import CrossEntropyLoss
from torch.optim import SGD
from torchvision.models import wide_resnet50_2
Expand Down Expand Up @@ -34,7 +35,7 @@ def train_step(engine, batch):
optimizer.zero_grad()

# Runs the forward pass with autocasting.
with autocast():
with autocast("cuda"):
y_pred = model(x)
loss = criterion(y_pred, y)

Expand Down
5 changes: 3 additions & 2 deletions examples/cifar10_qat/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import torch.nn as nn
import torch.optim as optim
import utils
from torch.cuda.amp import autocast, GradScaler
from torch.amp import autocast
from torch.cuda.amp import GradScaler

import ignite
import ignite.distributed as idist
Expand Down Expand Up @@ -283,7 +284,7 @@ def train_step(engine, batch):

model.train()

with autocast(enabled=with_amp):
with autocast("cuda", enabled=with_amp):
y_pred = model(x)
loss = criterion(y_pred, y)

Expand Down
5 changes: 3 additions & 2 deletions examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -887,7 +887,7 @@
"id": "JE8dLeEfIl_Z"
},
"source": [
"We will use [`torch.cuda.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.autocast) and [`torch.cuda.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)."
"We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.cuda.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)."
]
},
{
Expand All @@ -896,7 +896,8 @@
"id": "vrJls4p-FRcA"
},
"source": [
"from torch.cuda.amp import autocast, GradScaler\n",
"from torch.cuda.amp import GradScaler\n",
"from torch.amp import autocast\n",
"\n",
"from ignite.utils import convert_tensor\n",
"import torch.nn.functional as F\n",
Expand Down
9 changes: 5 additions & 4 deletions examples/references/classification/imagenet/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
import torch

try:
from torch.cuda.amp import autocast, GradScaler
from torch.amp import autocast
from torch.cuda.amp import GradScaler
except ImportError:
raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.6.0")
raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0")

import dataflow as data
import utils
Expand Down Expand Up @@ -144,7 +145,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w
def training_step(engine, batch):
model.train()
x, y = prepare_batch(batch, device=device, non_blocking=True)
with autocast(enabled=with_amp):
with autocast("cuda", enabled=with_amp):
y_pred = model(x)
y_pred = model_output_transform(y_pred)
loss = criterion(y_pred, y) / accumulation_steps
Expand Down Expand Up @@ -235,7 +236,7 @@ def create_evaluator(model, metrics, config, with_clearml, tag="val"):
@torch.no_grad()
def evaluate_step(engine, batch):
model.eval()
with autocast(enabled=with_amp):
with autocast("cuda", enabled=with_amp):
x, y = prepare_batch(batch, device=config.device, non_blocking=True)
y_pred = model(x)
y_pred = model_output_transform(y_pred)
Expand Down
9 changes: 5 additions & 4 deletions examples/references/segmentation/pascal_voc2012/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
import torch

try:
from torch.cuda.amp import autocast, GradScaler
from torch.amp import autocast
from torch.cuda.amp import GradScaler
except ImportError:
raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.6.0")
raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0")

import dataflow as data
import utils
Expand Down Expand Up @@ -191,7 +192,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w
def forward_pass(batch):
model.train()
x, y = prepare_batch(batch, device=device, non_blocking=True)
with autocast(enabled=with_amp):
with autocast("cuda", enabled=with_amp):
y_pred = model(x)
y_pred = model_output_transform(y_pred)
loss = criterion(y_pred, y) / accumulation_steps
Expand Down Expand Up @@ -272,7 +273,7 @@ def create_evaluator(model, metrics, config, with_clearml, tag="val"):
@torch.no_grad()
def evaluate_step(engine, batch):
model.eval()
with autocast(enabled=with_amp):
with autocast("cuda", enabled=with_amp):
x, y = prepare_batch(batch, device=config.device, non_blocking=True)
y_pred = model(x)
y_pred = model_output_transform(y_pred)
Expand Down
7 changes: 4 additions & 3 deletions examples/transformers/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import torch.nn as nn
import torch.optim as optim
import utils
from torch.cuda.amp import autocast, GradScaler
from torch.amp import autocast
from torch.cuda.amp import GradScaler

import ignite
import ignite.distributed as idist
Expand Down Expand Up @@ -309,7 +310,7 @@ def train_step(engine, batch):

model.train()

with autocast(enabled=with_amp):
with autocast("cuda", enabled=with_amp):
y_pred = model(input_batch)
loss = criterion(y_pred, labels)

Expand Down Expand Up @@ -373,7 +374,7 @@ def evaluate_step(engine, batch):
input_batch = {k: v.to(device, non_blocking=True, dtype=torch.long) for k, v in batch[0].items()}
labels = labels.to(device, non_blocking=True, dtype=torch.float)

with autocast(enabled=with_amp):
with autocast("cuda", enabled=with_amp):
output = model(input_batch)
return output, labels

Expand Down
2 changes: 1 addition & 1 deletion ignite/contrib/engines/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def setup_common_training_handlers(
lr_scheduler: learning rate scheduler
as native torch LRScheduler or ignite's parameter scheduler.
with_gpu_stats: if True, :class:`~ignite.metrics.GpuInfo` is attached to the
trainer. This requires `pynvml` package to be installed.
trainer. This requires `pynvml<12` package to be installed.
output_names: list of names associated with `update_function` output dictionary.
with_pbars: if True, two progress bars on epochs and optionally on iterations are attached.
Default, True.
Expand Down
12 changes: 6 additions & 6 deletions ignite/engine/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,9 @@ def supervised_training_step_amp(
"""

try:
from torch.cuda.amp import autocast
from torch.amp import autocast
except ImportError:
raise ImportError("Please install torch>=1.6.0 to use amp_mode='amp'.")
raise ImportError("Please install torch>=1.12.0 to use amp_mode='amp'.")

if gradient_accumulation_steps <= 0:
raise ValueError(
Expand All @@ -200,7 +200,7 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to
optimizer.zero_grad()
model.train()
x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
with autocast(enabled=True):
with autocast("cuda", enabled=True):
output = model_fn(model, x)
y_pred = model_transform(output)
loss = loss_fn(y_pred, y)
Expand Down Expand Up @@ -726,15 +726,15 @@ def supervised_evaluation_step_amp(
Added `model_fn` to customize model's application on the sample
"""
try:
from torch.cuda.amp import autocast
from torch.amp import autocast
except ImportError:
raise ImportError("Please install torch>=1.6.0 to use amp_mode='amp'.")
raise ImportError("Please install torch>=1.12.0 to use amp_mode='amp'.")

def evaluate_step(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[torch.Tensor]]:
model.eval()
with torch.no_grad():
x, y = prepare_batch(batch, device=device, non_blocking=non_blocking)
with autocast(enabled=True):
with autocast("cuda", enabled=True):
output = model_fn(model, x)
y_pred = model_transform(output)
return output_transform(x, y, y_pred)
Expand Down
Loading

0 comments on commit 064cfda

Please sign in to comment.