train.py
import os
import numpy as np
import random
import time
import torch
from torch.utils.data import DataLoader


def nll_gaussian(mean, logvar, value):
"""Compute negative log-likelihood of Gaussian."""
assert mean.size() == logvar.size() == value.size()
pi = torch.FloatTensor([np.pi]).to(value.device)
nll_element = (value - mean).pow(2) / torch.exp(logvar) + logvar + torch.log(2*pi)
return torch.sum(0.5*nll_element)
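
# Sanity check (sketch): for a standard Gaussian (mean=0, logvar=0) evaluated at
# value=0, the per-element NLL reduces to 0.5*log(2*pi) ~= 0.9189, so
#   nll_gaussian(torch.zeros(3), torch.zeros(3), torch.zeros(3))
# returns approximately 3 * 0.9189 = 2.7568.
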
def run_epoch(dataloader, model, optimizer, device, train=True):
"""Perform one epoch of training by looping through the dataset once."""
    # Put the model and the dataset into train/test mode
    if train:
        model = model.train()
        dataloader.dataset.train()
    else:
        model = model.eval()
        dataloader.dataset.test()
    nll_total = 0.0
    for batch_idx, (batch, _) in enumerate(dataloader):
        assert isinstance(batch, torch.Tensor)
        batch = batch.to(device)  # batch is a preseismic time series
        # Compute the loss (negative log-likelihood); autograd is only needed when training
        with torch.set_grad_enabled(train):
            pred_means, pred_logvars = model(batch, generate_dpm=False)  # calls model.forward()
            nll_batch = nll_gaussian(pred_means, pred_logvars, batch)  # NLL of the batch under the predicted Gaussians
        if train:
            optimizer.zero_grad()
            nll_batch.backward()  # compute gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 10)  # clip the gradient norm to 10
            optimizer.step()  # one step of gradient descent
        nll_total += nll_batch.item()  # .item() discards gradient information, which is more memory efficient
    nll_average = nll_total / len(dataloader.dataset)  # average NLL per sequence
    print('{}\t| nll: {:.6f}'.format('TRAIN' if train else 'TEST', nll_average))
    return nll_average
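
# Design note: run_epoch serves both phases with the same DataLoader; the
# underlying dataset object is expected to expose train()/test() methods that
# switch which split it yields (an assumption about the project's dataset class).
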
def train_model(train_config, model, dataset, device, save_dir):
    assert 'batch_size' in train_config and isinstance(train_config['batch_size'], int) and train_config['batch_size'] > 0
    assert 'num_epochs' in train_config and isinstance(train_config['num_epochs'], int) and train_config['num_epochs'] > 0
    assert 'learning_rate' in train_config and train_config['learning_rate'] > 0.0

    # Seed every RNG in play for reproducibility
    seed = train_config['seed']
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.cuda.manual_seed_all(seed)  # seeds all CUDA devices (subsumes torch.cuda.manual_seed)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    # torch.set_deterministic(True)
    # torch.backends.cudnn.enabled = False

    # Initialize the dataloader
    # See documentation at https://pytorch.org/docs/stable/data.html
    # worker_init_fn must be a callable; it reseeds NumPy per worker
    # (only used when num_workers > 0)
    dataloader = DataLoader(dataset, batch_size=train_config['batch_size'], shuffle=True,
                            worker_init_fn=lambda worker_id: np.random.seed(seed + worker_id),
                            num_workers=0)  # set batch_size here

    # Initialize the optimizer (Adam by default)
    optimizer = torch.optim.Adam(model.parameters(), lr=train_config['learning_rate'])  # set learning_rate here

    # Initialize bookkeeping variables
    log = []
    best_test_epoch = 0
    best_test_loss = float('inf')
    start_time = time.time()

    for epoch in range(train_config['num_epochs']):
        print('--- EPOCH [{}/{}] ---'.format(epoch + 1, train_config['num_epochs']))
        epoch_start_time = time.time()
        train_loss = run_epoch(dataloader, model, optimizer, device, train=True)
        test_loss = run_epoch(dataloader, model, optimizer, device, train=False)
        epoch_time = time.time() - epoch_start_time
        print('{:.3f} seconds'.format(epoch_time))
        log.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'test_loss': test_loss,
            'time': epoch_time
        })
        # Save the model with the best test loss so far
        if test_loss < best_test_loss:
            best_test_loss = test_loss
            best_test_epoch = epoch + 1
            torch.save(model.state_dict(), os.path.join(save_dir, 'best_model.pth'))
            print('BEST test loss')

    # Save the final model
    torch.save(model.state_dict(), os.path.join(save_dir, 'final_model.pth'))
    print('--- DONE training model ---')

    # Compute summary statistics
    summary = {
        'total_time': round(time.time() - start_time, 3),
        'average_epoch_time': round((time.time() - start_time) / train_config['num_epochs'], 3),
        'best_test_loss': best_test_loss,
        'best_test_epoch': best_test_epoch,
        'num_trainable_params': model.num_parameters()  # assumes the model class defines num_parameters()
    }
    return train_config, summary, log
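
# Usage sketch (hypothetical wiring; `SeismicModel` and `SeismicDataset` are
# placeholder names for the project's actual model/dataset classes):
#   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#   train_config = {'batch_size': 32, 'num_epochs': 100,
#                   'learning_rate': 1e-3, 'seed': 0}
#   model = SeismicModel().to(device)
#   dataset = SeismicDataset()
#   config, summary, log = train_model(train_config, model, dataset, device,
#                                      save_dir='checkpoints')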