This repository has been archived by the owner on Jul 1, 2024. It is now read-only.

Remove local_variables from train_step/eval_step (#412)
Summary:
Pull Request resolved: #412

This is part of a series of diffs to eliminate local_variables (see D20171981).
Now that we've removed local_variables from step, remove it
from train_step and eval_step as well.

Reviewed By: mannatsingh

Differential Revision: D20170006

fbshipit-source-id: a5c6525424fc89711de40b8b6906b44c8aa608f4
vreis authored and facebook-github-bot committed Mar 3, 2020
1 parent e29cc72 commit 841b439
Showing 5 changed files with 35 additions and 78 deletions.
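
At the API level the change is simply the dropped keyword argument; anything a hook needs is now published on the task itself rather than passed around in a dict. A minimal before/after sketch of a call site (illustrative only, not taken verbatim from the diffs):

    # Before this commit: callers threaded a mutable dict through each step
    # so hooks could inspect intermediate values.
    task.train_step(use_gpu, local_variables={})

    # After this commit: the step only needs the device flag; per-batch data
    # is exposed on the task instead (see task.last_batch in the diffs below).
    task.train_step(use_gpu)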
90 changes: 26 additions & 64 deletions classy_vision/tasks/classification_task.py
@@ -638,120 +638,85 @@ def set_classy_state(self, state):
         # Set up pytorch module in train vs eval mode, update optimizer.
         self._set_model_train_mode()
 
-    def eval_step(self, use_gpu, local_variables=None):
-        if local_variables is None:
-            local_variables = {}
-
+    def eval_step(self, use_gpu):
         self.last_batch = None
 
         # Process next sample
         sample = next(self.get_data_iterator())
-        local_variables["sample"] = sample
 
-        assert (
-            isinstance(local_variables["sample"], dict)
-            and "input" in local_variables["sample"]
-            and "target" in local_variables["sample"]
-        ), (
+        assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
             f"Returned sample [{sample}] is not a map with 'input' and"
             + "'target' keys"
         )
 
         # Copy sample to GPU
-        local_variables["target"] = local_variables["sample"]["target"]
+        target = sample["target"]
         if use_gpu:
-            for key, value in local_variables["sample"].items():
-                local_variables["sample"][key] = recursive_copy_to_gpu(
-                    value, non_blocking=True
-                )
+            for key, value in sample.items():
+                sample[key] = recursive_copy_to_gpu(value, non_blocking=True)
 
         with torch.no_grad():
-            local_variables["output"] = self.model(local_variables["sample"]["input"])
+            output = self.model(sample["input"])
 
-            local_variables["local_loss"] = self.compute_loss(
-                local_variables["output"], local_variables["sample"]
-            )
+            local_loss = self.compute_loss(output, sample)
 
-            local_variables["loss"] = local_variables["local_loss"].detach().clone()
-            local_variables["loss"] = all_reduce_mean(local_variables["loss"])
+            loss = local_loss.detach().clone()
+            loss = all_reduce_mean(loss)
 
-            self.losses.append(
-                local_variables["loss"].data.cpu().item()
-                * local_variables["target"].size(0)
-            )
+            self.losses.append(loss.data.cpu().item() * target.size(0))
 
-            self.update_meters(local_variables["output"], local_variables["sample"])
+            self.update_meters(output, sample)
 
         # Move some data to the task so hooks get a chance to access it
         self.last_batch = LastBatchInfo(
-            loss=local_variables["loss"],
-            output=local_variables["output"],
-            target=local_variables["target"],
-            sample=local_variables["sample"],
+            loss=loss, output=output, target=target, sample=sample
         )
 
-    def train_step(self, use_gpu, local_variables=None):
+    def train_step(self, use_gpu):
         """Train step to be executed in train loop
 
         Args:
             use_gpu: if true, execute training on GPU
-            local_variables: Dict containing intermediate values
-                in train_step for access by hooks
         """
 
-        if local_variables is None:
-            local_variables = {}
-
         self.last_batch = None
 
         # Process next sample
         sample = next(self.get_data_iterator())
-        local_variables["sample"] = sample
 
-        assert (
-            isinstance(local_variables["sample"], dict)
-            and "input" in local_variables["sample"]
-            and "target" in local_variables["sample"]
-        ), (
+        assert isinstance(sample, dict) and "input" in sample and "target" in sample, (
             f"Returned sample [{sample}] is not a map with 'input' and"
             + "'target' keys"
         )
 
         # Copy sample to GPU
-        local_variables["target"] = local_variables["sample"]["target"]
+        target = sample["target"]
         if use_gpu:
-            for key, value in local_variables["sample"].items():
-                local_variables["sample"][key] = recursive_copy_to_gpu(
-                    value, non_blocking=True
-                )
+            for key, value in sample.items():
+                sample[key] = recursive_copy_to_gpu(value, non_blocking=True)
 
         with torch.enable_grad():
             # Forward pass
-            local_variables["output"] = self.model(local_variables["sample"]["input"])
+            output = self.model(sample["input"])
 
-            local_variables["local_loss"] = self.compute_loss(
-                local_variables["output"], local_variables["sample"]
-            )
+            local_loss = self.compute_loss(output, sample)
 
-            local_variables["loss"] = local_variables["local_loss"].detach().clone()
-            local_variables["loss"] = all_reduce_mean(local_variables["loss"])
+            loss = local_loss.detach().clone()
+            loss = all_reduce_mean(loss)
 
-            self.losses.append(
-                local_variables["loss"].data.cpu().item()
-                * local_variables["target"].size(0)
-            )
+            self.losses.append(loss.data.cpu().item() * target.size(0))
 
-            self.update_meters(local_variables["output"], local_variables["sample"])
+            self.update_meters(output, sample)
 
         # Run backwards pass / update optimizer
         if self.amp_opt_level is not None:
             self.optimizer.zero_grad()
             with apex.amp.scale_loss(
-                local_variables["local_loss"], self.optimizer.optimizer
+                local_loss, self.optimizer.optimizer
             ) as scaled_loss:
                 scaled_loss.backward()
         else:
-            self.optimizer.backward(local_variables["local_loss"])
+            self.optimizer.backward(local_loss)
 
         self.optimizer.update_schedule_on_step(self.where)
         self.optimizer.step()
@@ -760,10 +725,7 @@ def train_step(self, use_gpu, local_variables=None):
 
         # Move some data to the task so hooks get a chance to access it
         self.last_batch = LastBatchInfo(
-            loss=local_variables["loss"],
-            output=local_variables["output"],
-            target=local_variables["target"],
-            sample=local_variables["sample"],
+            loss=loss, output=output, target=target, sample=sample
         )
 
     def compute_loss(self, model_output, sample):
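
The practical effect for hook authors: per-batch values that used to be read out of local_variables now live on task.last_batch, which carries the loss, output, target and sample fields set above. A minimal sketch of an on_step hook body written against that assumption (the print is illustrative and not part of Classy Vision):

    def on_step(self, task):
        # train_step/eval_step now stash the batch on the task itself
        batch = task.last_batch
        if batch is None:  # the step did not complete for this iteration
            return
        # loss is already detached and all-reduced; target carries the batch size
        print(f"loss={batch.loss.item():.4f} batch_size={batch.target.size(0)}")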
14 changes: 5 additions & 9 deletions classy_vision/tasks/classy_task.py
@@ -107,16 +107,14 @@ def prepare(
         pass
 
     @abstractmethod
-    def train_step(self, use_gpu, local_variables: Optional[Dict] = None) -> None:
+    def train_step(self, use_gpu) -> None:
         """
         Run a train step.
 
         This corresponds to training over one batch of data from the dataloaders.
 
         Args:
             use_gpu: True if training on GPUs, False otherwise
-            local_variables: Local variables created in the function. Can be passed to
-                custom :class:`classy_vision.hooks.ClassyHook`.
         """
         pass
 
@@ -157,26 +155,24 @@ def on_end(self, local_variables):
         pass
 
     @abstractmethod
-    def eval_step(self, use_gpu, local_variables: Optional[Dict] = None) -> None:
+    def eval_step(self, use_gpu) -> None:
         """
         Run an evaluation step.
 
         This corresponds to evaluating the model over one batch of data.
 
         Args:
             use_gpu: True if training on GPUs, False otherwise
-            local_variables: Local variables created in the function. Can be passed to
-                custom :class:`classy_vision.hooks.ClassyHook`.
         """
         pass
 
-    def step(self, use_gpu, local_variables: Optional[Dict] = None) -> None:
+    def step(self, use_gpu) -> None:
        from classy_vision.hooks import ClassyHookFunctions
 
         if self.train:
-            self.train_step(use_gpu, local_variables)
+            self.train_step(use_gpu)
         else:
-            self.eval_step(use_gpu, local_variables)
+            self.eval_step(use_gpu)
 
         for hook in self.hooks:
             hook.on_step(self)
2 changes: 1 addition & 1 deletion classy_vision/trainer/classy_trainer.py
@@ -77,7 +77,7 @@ def train(self, task: ClassyTask):
             task.on_phase_start(local_variables)
             while True:
                 try:
-                    task.step(self.use_gpu, local_variables)
+                    task.step(self.use_gpu)
                 except StopIteration:
                     break
             task.on_phase_end(local_variables)
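
For context, the loop this one-line trainer change sits in boils down to the sketch below (simplified; the phase and hook bookkeeping of ClassyTrainer.train is omitted):

    def run_phase(task, use_gpu):
        # Drive one phase until the task's data iterator is exhausted.
        while True:
            try:
                task.step(use_gpu)  # no local_variables threaded through anymore
            except StopIteration:
                break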
2 changes: 1 addition & 1 deletion classy_vision/trainer/elastic_trainer.py
@@ -106,7 +106,7 @@ def _run_step(self, state, local_variables, use_gpu):
                 state.advance_to_next_phase = True
                 state.skip_current_phase = False  # Reset flag
             else:
-                state.task.step(use_gpu, local_variables)
+                state.task.step(use_gpu)
         except StopIteration:
             state.advance_to_next_phase = True
 
5 changes: 2 additions & 3 deletions test/tasks_classification_task_test.py
@@ -73,7 +73,6 @@ def test_checkpointing(self):
         task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()])
 
         use_gpu = torch.cuda.is_available()
-        local_variables = {}
 
         # prepare the tasks for the right device
         task.prepare(use_gpu=use_gpu)
@@ -96,8 +95,8 @@
 
         # test that the train step runs the same way on both states
         # and the loss remains the same
-        task.train_step(use_gpu, local_variables)
-        task_2.train_step(use_gpu, local_variables)
+        task.train_step(use_gpu)
+        task_2.train_step(use_gpu)
         self._compare_states(task.get_classy_state(), task_2.get_classy_state())
 
     def test_final_train_checkpoint(self):
