Move use_gpu from ClassyTrainer to ClassificationTask (#468)
Summary:
Pull Request resolved: #468

This is the first in a series of diffs to eliminate the ClassyTrainer abstraction. The only reason Trainer existed was to support elastic training, but PET v0.2 does not require changing our training loop. The plan is to move all attributes from ClassyTrainer into ClassificationTask.

Start by moving use_gpu to the task.
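
A minimal sketch of where the flag ends up once the series lands (illustrative only; a real task also needs datasets, model, loss and optimizer configured before training):

import torch
from classy_vision.tasks import ClassificationTask
from classy_vision.trainer import LocalTrainer

# Before this diff the trainer carried the flag:
#     trainer = LocalTrainer(use_gpu=True, num_dataloader_workers=4)
# After it, the task does:
task = ClassificationTask().set_use_gpu(torch.cuda.is_available())
trainer = LocalTrainer(num_dataloader_workers=4)  # no use_gpu argument any more
# trainer.train(task)  # runs once the task is fully configured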

Reviewed By: mannatsingh

Differential Revision: D20801017

fbshipit-source-id: d56bb330737a98a7ea7545d33da0ad0f21a0b6a1
vreis authored and facebook-github-bot committed Apr 8, 2020
1 parent 6214d10 commit 7fdcffc
Showing 14 changed files with 70 additions and 154 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -106,7 +106,7 @@ jobs:
pip install .
classy-project my-project
pushd my-project
./classy_train.py --device cpu --config configs/template_config.json
./classy_train.py --config configs/template_config.json
popd
rm -rf my-project
7 changes: 1 addition & 6 deletions classy_train.py
@@ -93,18 +93,13 @@ def main(args, config):
# Configure hooks to do tensorboard logging, checkpoints and so on
task.set_hooks(configure_hooks(args, config))

use_gpu = None
if args.device is not None:
use_gpu = args.device == "gpu"
assert torch.cuda.is_available() or not use_gpu, "CUDA is unavailable"

# LocalTrainer is used for a single node. DistributedTrainer will setup
# training to use PyTorch's DistributedDataParallel.
trainer_class = {"none": LocalTrainer, "ddp": DistributedTrainer}[
args.distributed_backend
]

trainer = trainer_class(use_gpu=use_gpu, num_dataloader_workers=args.num_workers)
trainer = trainer_class(num_dataloader_workers=args.num_workers)

logging.info(
f"Starting training on rank {get_rank()} worker. "
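
With the --device flag gone from classy_train.py, the device choice now comes from the task: from_config reads an optional "use_gpu" key, and it can still be overridden in code. A hedged sketch, assuming build_task is the registry helper classy_train.py uses to turn the config into a task, and reusing the config path from the CI job above:

import json

import torch
from classy_vision.tasks import build_task

with open("configs/template_config.json") as f:
    config = json.load(f)

config["use_gpu"] = torch.cuda.is_available()  # optional; this is the default anyway
task = build_task(config)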
13 changes: 0 additions & 13 deletions classy_vision/generic/opts.py
@@ -18,12 +18,6 @@ def add_generic_args(parser):
parser.add_argument(
"--config_file", type=str, help="path to config file for model", required=True
)
parser.add_argument(
"--device",
default=None,
type=str,
help="device to use: either 'cpu' or 'gpu'. If unspecified, will use GPU when available and CPU otherwise.",
)
parser.add_argument(
"--num_workers",
default=4,
@@ -145,13 +139,6 @@ def check_generic_args(args):
# check types and values:
assert is_pos_int(args.num_workers), "incorrect number of workers"
assert is_pos_int(args.visdom_port), "incorrect visdom port"
assert (
args.device is None or args.device == "cpu" or args.device == "gpu"
), "unknown device"

# check that CUDA is available:
if args.device == "gpu":
assert torch.cuda.is_available(), "CUDA required to train on GPUs"

# create checkpoint folder if it does not exist:
if args.checkpoint_folder != "" and not os.path.exists(args.checkpoint_folder):
43 changes: 24 additions & 19 deletions classy_vision/tasks/classification_task.py
@@ -142,6 +142,16 @@ def __init__(self):
self.perf_log = []
self.last_batch = None
self.batch_norm_sync_mode = BatchNormSyncMode.DISABLED
self.use_gpu = torch.cuda.is_available()

def set_use_gpu(self, use_gpu: bool):
self.use_gpu = use_gpu

assert (
not self.use_gpu or torch.cuda.is_available()
), "CUDA required to train on GPUs"

return self

def set_checkpoint(self, checkpoint):
"""Sets checkpoint on task.
@@ -359,6 +369,10 @@ def from_config(cls, config: Dict[str, Any]) -> "ClassificationTask":
.set_hooks(hooks)
)

use_gpu = config.get("use_gpu")
if use_gpu is not None:
task.set_use_gpu(use_gpu)

for phase_type in phase_types:
task.set_dataset(datasets[phase_type], phase_type)

@@ -508,24 +522,19 @@ def build_dataloaders(
for phase_type in self.datasets.keys()
}

def prepare(
self,
num_dataloader_workers=0,
pin_memory=False,
use_gpu=False,
dataloader_mp_context=None,
):
def prepare(self, num_dataloader_workers=0, dataloader_mp_context=None):
"""Prepares task for training, populates all derived attributes
Args:
num_dataloader_workers: Number of dataloading processes. If 0,
dataloading is done on main process
pin_memory: if true pin memory on GPU
use_gpu: if true, load model, optimizer, loss, etc on GPU
dataloader_mp_context: Determines how processes are spawned.
Value must be one of None, "spawn", "fork", "forkserver".
If None, then context is inherited from parent process
"""

pin_memory = self.use_gpu and torch.cuda.device_count() > 1

self.phases = self._build_phases()
self.dataloaders = self.build_dataloaders(
num_workers=num_dataloader_workers,
@@ -539,7 +548,7 @@ def prepare(
self.base_model = apex.parallel.convert_syncbn_model(self.base_model)

# move the model and loss to the right device
if use_gpu:
if self.use_gpu:
self.base_model, self.loss = copy_model_to_gpu(self.base_model, self.loss)
else:
self.loss.cpu()
@@ -686,7 +695,7 @@ def set_classy_state(self, state):
# Set up pytorch module in train vs eval mode, update optimizer.
self._set_model_train_mode()

def eval_step(self, use_gpu):
def eval_step(self):
self.last_batch = None

# Process next sample
@@ -699,7 +708,7 @@ def eval_step(self, use_gpu):

# Copy sample to GPU
target = sample["target"]
if use_gpu:
if self.use_gpu:
for key, value in sample.items():
sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

@@ -726,12 +735,8 @@ def check_inf_nan(self, loss):
if loss == float("inf") or loss == float("-inf") or loss != loss:
raise FloatingPointError(f"Loss is infinity or NaN: {loss}")

def train_step(self, use_gpu):
"""Train step to be executed in train loop
Args:
use_gpu: if true, execute training on GPU
"""
def train_step(self):
"""Train step to be executed in train loop."""

self.last_batch = None

Expand All @@ -745,7 +750,7 @@ def train_step(self, use_gpu):

# Copy sample to GPU
target = sample["target"]
if use_gpu:
if self.use_gpu:
for key, value in sample.items():
sample[key] = recursive_copy_to_gpu(value, non_blocking=True)

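
The task now owns the flag end to end: it defaults to torch.cuda.is_available() in __init__, set_use_gpu() rejects GPU requests on hosts without CUDA, and prepare() derives pin_memory from it instead of accepting it as an argument. A small illustrative sketch of that behaviour:

import torch
from classy_vision.tasks import ClassificationTask

task = ClassificationTask()
assert task.use_gpu == torch.cuda.is_available()  # new default from __init__

task.set_use_gpu(False)     # always allowed; returns the task, so calls chain
if not torch.cuda.is_available():
    try:
        task.set_use_gpu(True)
    except AssertionError as err:
        print(err)          # "CUDA required to train on GPUs"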
23 changes: 6 additions & 17 deletions classy_vision/tasks/classy_task.py
@@ -86,11 +86,7 @@ def set_classy_state(self, state):

@abstractmethod
def prepare(
self,
num_dataloader_workers=0,
pin_memory=False,
use_gpu=False,
dataloader_mp_context=None,
self, num_dataloader_workers=0, dataloader_mp_context=None
) -> None:
"""
Prepares the task for training.
@@ -102,19 +98,15 @@ def prepare(
num_dataloader_workers: Number of workers to create for the dataloaders
pin_memory: Whether the dataloaders should copy the Tensors into CUDA
pinned memory (default False)
use_gpu: True if training on GPUs, False otherwise
"""
pass

@abstractmethod
def train_step(self, use_gpu) -> None:
def train_step(self) -> None:
"""
Run a train step.
This corresponds to training over one batch of data from the dataloaders.
Args:
use_gpu: True if training on GPUs, False otherwise
"""
pass

@@ -155,24 +147,21 @@ def on_end(self):
pass

@abstractmethod
def eval_step(self, use_gpu) -> None:
def eval_step(self) -> None:
"""
Run an evaluation step.
This corresponds to evaluating the model over one batch of data.
Args:
use_gpu: True if training on GPUs, False otherwise
"""
pass

def step(self, use_gpu) -> None:
def step(self) -> None:
from classy_vision.hooks import ClassyHookFunctions

if self.train:
self.train_step(use_gpu)
self.train_step()
else:
self.eval_step(use_gpu)
self.eval_step()

for hook in self.hooks:
hook.on_step(self)
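
For custom tasks the contract change is purely in the signatures: step() now dispatches to train_step()/eval_step() with no arguments, so device decisions have to come from the task's own state (ClassificationTask keeps it in self.use_gpu; the base ClassyTask defines no such attribute). A hypothetical subclass after this change, with the remaining abstract methods omitted:

from classy_vision.tasks import ClassyTask

class MyTask(ClassyTask):
    def __init__(self):
        super().__init__()
        self.use_gpu = False        # device state lives on the task itself

    def train_step(self) -> None:   # previously train_step(self, use_gpu)
        ...

    def eval_step(self) -> None:    # previously eval_step(self, use_gpu)
        ...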
10 changes: 2 additions & 8 deletions classy_vision/tasks/fine_tuning_task.py
@@ -67,18 +67,12 @@ def _set_model_train_mode(self):
self.base_model.train(phase["train"])

def prepare(
self,
num_dataloader_workers: int = 0,
pin_memory: bool = False,
use_gpu: bool = False,
dataloader_mp_context=None,
self, num_dataloader_workers: int = 0, dataloader_mp_context=None
) -> None:
assert (
self.pretrained_checkpoint is not None
), "Need a pretrained checkpoint for fine tuning"
super().prepare(
num_dataloader_workers, pin_memory, use_gpu, dataloader_mp_context
)
super().prepare(num_dataloader_workers, dataloader_mp_context)
if self.checkpoint is None:
# no checkpoint exists, load the model's state from the pretrained
# checkpoint
12 changes: 1 addition & 11 deletions classy_vision/trainer/classy_trainer.py
@@ -27,25 +27,18 @@ class ClassyTrainer:

def __init__(
self,
use_gpu: Optional[bool] = None,
num_dataloader_workers: int = 0,
dataloader_mp_context: Optional[str] = None,
):
"""Constructor for ClassyTrainer.
Args:
use_gpu: If true, then use GPUs for training.
If None, then check if we have GPUs available, if we do
then use GPU for training.
num_dataloader_workers: Number of CPU processes doing dataloading
per GPU. If 0, then dataloading is done on main thread.
dataloader_mp_context: Determines how to launch
new processes for dataloading. Must be one of "fork", "forkserver",
"spawn". If None, process launching is inherited from parent.
"""
if use_gpu is None:
use_gpu = torch.cuda.is_available()
self.use_gpu = use_gpu
self.num_dataloader_workers = num_dataloader_workers
self.dataloader_mp_context = dataloader_mp_context

@@ -57,11 +50,8 @@ def train(self, task: ClassyTask):
everything that is needed for training
"""

pin_memory = self.use_gpu and torch.cuda.device_count() > 1
task.prepare(
num_dataloader_workers=self.num_dataloader_workers,
pin_memory=pin_memory,
use_gpu=self.use_gpu,
dataloader_mp_context=self.dataloader_mp_context,
)
assert isinstance(task, ClassyTask)
Expand All @@ -75,7 +65,7 @@ def train(self, task: ClassyTask):
task.on_phase_start()
while True:
try:
task.step(self.use_gpu)
task.step()
except StopIteration:
break
task.on_phase_end()
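
For code that drives a task directly instead of going through a trainer, the call sites shrink accordingly. A hedged sketch; "task" stands for a fully configured ClassificationTask, with whatever phase handling the real training loop performs around step():

# before: task.prepare(num_dataloader_workers=2, pin_memory=True, use_gpu=True)
#         task.step(use_gpu)
# after:  the task already knows its device and derives pin_memory itself
task.prepare(num_dataloader_workers=2)
task.step()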
30 changes: 5 additions & 25 deletions classy_vision/trainer/distributed_trainer.py
@@ -56,39 +56,19 @@ class DistributedTrainer(ClassyTrainer):
"""Distributed trainer for using multiple training processes
"""

def __init__(
self,
use_gpu: Optional[bool] = None,
num_dataloader_workers: int = 0,
dataloader_mp_context: Optional[str] = None,
):
"""Constructor for DistributedTrainer.
Args:
use_gpu: If true, then use GPU 0 for training.
If None, then check if we have GPUs available, if we do
then use GPU for training.
num_dataloader_workers: Number of CPU processes doing dataloading
per GPU. If 0, then dataloading is done on main thread.
dataloader_mp_context: Determines how to launch
new processes for dataloading. Must be one of "fork", "forkserver",
"spawn". If None, process launching is inherited from parent.
"""
super().__init__(
use_gpu=use_gpu,
num_dataloader_workers=num_dataloader_workers,
dataloader_mp_context=dataloader_mp_context,
)
def train(self, task):
_init_env_vars()
_init_distributed(self.use_gpu)
_init_distributed(task.use_gpu)
logging.info(
f"Done setting up distributed process_group with rank {get_rank()}"
+ f", world_size {get_world_size()}"
)
local_rank = int(os.environ["LOCAL_RANK"])
if self.use_gpu:
if task.use_gpu:
logging.info("Using GPU, CUDA device index: {}".format(local_rank))
set_cuda_device_index(local_rank)
else:
logging.info("Using CPU")
set_cpu_device()

super().train(task)
28 changes: 4 additions & 24 deletions classy_vision/trainer/local_trainer.py
@@ -16,32 +16,12 @@ class LocalTrainer(ClassyTrainer):
"""Trainer to be used if you want want use only a single training process.
"""

def __init__(
self,
use_gpu: Optional[bool] = None,
num_dataloader_workers: int = 0,
dataloader_mp_context: Optional[str] = None,
):
"""Constructor for LocalTrainer.
Args:
use_gpu: If true, then use GPU 0 for training.
If None, then check if we have GPUs available, if we do
then use GPU for training.
num_dataloader_workers: Number of CPU processes doing dataloading
per GPU. If 0, then dataloading is done on main thread.
dataloader_mp_context: Determines how to launch
new processes for dataloading. Must be one of "fork", "forkserver",
"spawn". If None, process launching is inherited from parent.
"""
super().__init__(
use_gpu=use_gpu,
num_dataloader_workers=num_dataloader_workers,
dataloader_mp_context=dataloader_mp_context,
)
if self.use_gpu:
def train(self, task):
if task.use_gpu:
logging.info("Using GPU, CUDA device index: {}".format(0))
set_cuda_device_index(0)
else:
logging.info("Using CPU")
set_cpu_device()

super().train(task)
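
Both trainers now pick up the device choice from the task inside train(): LocalTrainer pins CUDA device index 0, while DistributedTrainer pins the index given by the LOCAL_RANK environment variable set by the launcher. Illustrative only; "task" is assumed to be a fully configured task:

import torch
from classy_vision.trainer import DistributedTrainer, LocalTrainer

task.set_use_gpu(torch.cuda.is_available())
trainer = LocalTrainer()      # or DistributedTrainer() when run under a launcher
trainer.train(task)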