Move data to the device once #61

Merged
merged 2 commits on Dec 15, 2021
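In short, this PR changes hydragnn.utils.distributed.get_device to return a single torch.device instead of a (name, device) tuple, adds get_device_name and get_device_from_name, and moves graph data onto the device once while the serialized dataset is loaded, so the train/validate/test loops no longer copy each batch. A minimal caller-side sketch of the new return value (illustrative only; it mirrors the hydragnn/models/create.py change below):

# Illustrative sketch: get_device now returns a torch.device directly.
from hydragnn.utils.distributed import get_device

device = get_device(use_gpu=True, verbosity_level=0)
print(device)  # e.g. "cuda:0" when a GPU is available, otherwise "cpu"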
2 changes: 1 addition & 1 deletion hydragnn/models/create.py
@@ -85,7 +85,7 @@ def create_model(
timer.start()
torch.manual_seed(0)

_, device = get_device(use_gpu, verbosity_level=verbosity)
device = get_device(use_gpu, verbosity_level=verbosity)

if model_type == "GIN":
model = GINStack(
4 changes: 4 additions & 0 deletions hydragnn/preprocess/serialized_dataset_loader.py
@@ -18,6 +18,7 @@
from torch_geometric.transforms import RadiusGraph, Distance

from .dataset_descriptors import AtomFeatures
from hydragnn.utils.distributed import get_device
from hydragnn.utils.print_utils import print_distributed, iterate_tqdm


@@ -81,7 +82,10 @@ def load_serialized_data(self, dataset_path: str, config):
for data in dataset:
data.edge_attr = data.edge_attr / max_edge_length

# Move data to the device, if used. # FIXME: this does not respect the choice set by use_gpu
device = get_device(verbosity_level=self.verbosity)
for data in dataset:
data.to(device)
self.__update_predicted_values(
config["Variables_of_interest"]["type"],
config["Variables_of_interest"]["output_index"],
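The block added above moves every graph onto the target device a single time, right after edge attributes are normalized; as the FIXME notes, it does not yet honor the use_gpu setting because get_device is called with its defaults. A standalone sketch of the same idea, assuming an iterable of torch_geometric Data objects (the helper is the one imported at the top of this file):

# Sketch: place an in-memory dataset on the device once, instead of once per batch.
from hydragnn.utils.distributed import get_device

def move_dataset_to_device(dataset, verbosity=0):
    device = get_device(verbosity_level=verbosity)  # defaults to a GPU when one is available
    for data in dataset:
        data.to(device)  # Data.to moves each tensor attribute; mirrors the loop in the diff above
    return dataset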
8 changes: 0 additions & 8 deletions hydragnn/train/train_validate_test.py
@@ -190,15 +190,13 @@ def get_head_indices(model, data):


def train(loader, model, opt, verbosity):
device = next(model.parameters()).device
tasks_error = np.zeros(model.num_heads)
tasks_noderr = np.zeros(model.num_heads)

model.train()

total_error = 0
for data in iterate_tqdm(loader, verbosity):
data = data.to(device)
opt.zero_grad()
head_index = get_head_indices(model, data)

@@ -222,14 +220,11 @@ def train(loader, model, opt, verbosity):
@torch.no_grad()
def validate(loader, model, verbosity):

device = next(model.parameters()).device

total_error = 0
tasks_error = np.zeros(model.num_heads)
tasks_noderr = np.zeros(model.num_heads)
model.eval()
for data in iterate_tqdm(loader, verbosity):
data = data.to(device)
head_index = get_head_indices(model, data)

pred = model(data)
@@ -249,8 +244,6 @@ def validate(loader, model, verbosity):
@torch.no_grad()
def test(loader, model, verbosity):

device = next(model.parameters()).device

total_error = 0
tasks_error = np.zeros(model.num_heads)
tasks_noderr = np.zeros(model.num_heads)
@@ -265,7 +258,6 @@ def test(loader, model, verbosity):
for ihead in range(model.num_heads)
]
for data in iterate_tqdm(loader, verbosity):
data = data.to(device)
head_index = get_head_indices(model, data)

pred = model(data)
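Because batches now arrive already on the device, train, validate, and test drop both the device lookup from the model parameters and the per-batch data = data.to(device) copy. A stripped-down sketch of the resulting loop shape (the loss here is a placeholder; hydragnn's real loss is computed per head via get_head_indices):

# Sketch of the simplified loop: no data.to(device) per batch.
def train_epoch(loader, model, opt):
    model.train()
    total_error = 0.0
    for data in loader:  # each batch is already device-resident
        opt.zero_grad()
        pred = model(data)
        loss = ((pred - data.y) ** 2).mean()  # placeholder loss for illustration
        loss.backward()
        opt.step()
        total_error += loss.item()
    return total_error / len(loader)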
2 changes: 2 additions & 0 deletions hydragnn/utils/__init__.py
@@ -3,6 +3,8 @@
get_comm_size_and_rank,
get_device_list,
get_device,
get_device_name,
get_device_from_name,
is_model_distributed,
get_distributed_model,
)
20 changes: 16 additions & 4 deletions hydragnn/utils/distributed.py
@@ -132,12 +132,12 @@ def get_device_list():
return available_gpus


def get_device(use_gpu=True, rank_per_model=1, verbosity_level=0):
def get_device_name(use_gpu=True, rank_per_model=1, verbosity_level=0):

available_gpus = get_device_list()
if not use_gpu or not available_gpus:
print_distributed(verbosity_level, "Using CPU")
return "cpu", torch.device("cpu")
return "cpu"

world_size, world_rank = get_comm_size_and_rank()
if rank_per_model != 1:
@@ -162,19 +162,31 @@ def get_device(use_gpu=True, rank_per_model=1, verbosity_level=0):

device_name = "cuda:" + str(localrank)

return device_name, torch.device(device_name)
return device_name


def get_device_from_name(name: str):

return torch.device(name)


def get_device(use_gpu=True, rank_per_model=1, verbosity_level=0):

name = get_device_name(use_gpu, rank_per_model, verbosity_level)
return get_device_from_name(name)


def is_model_distributed(model):
return isinstance(model, torch.nn.parallel.distributed.DistributedDataParallel)


def get_distributed_model(model, verbosity=0):
device_name, device = get_device(verbosity)
device_name = get_device_name(verbosity)
if dist.is_initialized():
if device_name == "cpu":
model = torch.nn.parallel.DistributedDataParallel(model)
else:
device = get_device_from_name(device_name)
model = torch.nn.parallel.DistributedDataParallel(
model, device_ids=[device]
)
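The refactor splits the old two-value return into three functions: get_device_name resolves the device string (handling the CPU fallback and the local-rank mapping), get_device_from_name wraps that string in a torch.device, and get_device composes the two. A small usage sketch with the signatures from the diff above:

# Sketch: the three calls after the split compose as expected.
from hydragnn.utils.distributed import get_device, get_device_from_name, get_device_name

name = get_device_name(use_gpu=True, verbosity_level=0)  # e.g. "cpu" or "cuda:0"
device = get_device_from_name(name)                       # torch.device built from the name
assert device == get_device(use_gpu=True, verbosity_level=0)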
1 change: 0 additions & 1 deletion hydragnn/utils/time_utils.py
@@ -49,7 +49,6 @@ def stop(self):
self.elapsed_time = time.perf_counter() - self.start_time
self.start_time = None

_, self.device = get_device()
self.tmin = torch.Tensor([self.elapsed_time]).to(self.device)
self.tmax = torch.Tensor([self.elapsed_time]).to(self.device)
self.tavg = torch.Tensor([self.elapsed_time]).to(self.device)
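With the tuple return gone, stop() no longer re-resolves the device on every call; self.device is presumably assigned once elsewhere in the Timer (not shown in this diff). A hedged sketch of that assumption using the single-return get_device:

# Hypothetical sketch only; not taken from hydragnn's actual Timer.
from hydragnn.utils.distributed import get_device

class TimerSketch:
    def __init__(self):
        self.device = get_device()  # resolve the torch.device once, up front
        self.start_time = None
        self.elapsed_time = 0.0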