import sys
sys.path.append('../')
from hydroDL import master
from hydroDL.master import default
from hydroDL.data import camels
from hydroDL.model import rnn, crit, train
import json
import os
import numpy as np
import torch
import random
# Options for different interfaces
interfaceOpt = 1
# ==1 default, the improved and more interpretable version. It is easier to follow the data flow, model setup and
# training process. Recommended for most users.
# ==0 the original "pro" version we used to run heavy jobs for the paper. It was later refactored for clarity into option 1.
# The two options produce very similar results and differ little in computational performance.
Action = [1, 2]
# Use the Action list to control which models to train
# 1: Train base LSTM PUR models without integrating any soft info
# 2: Train CNN-LSTM to integrate FDCs
# Hyperparameters
EPOCH = 300
BATCH_SIZE = 100
RHO = 365
HIDDENSIZE = 256
saveEPOCH = 10  # save the model every "saveEPOCH" epochs
Ttrain = [19851001, 19951001]  # training period
LCrange = [19851001, 19951001]  # period used to derive the synthetic FDCs (see Action 2 below)
# Define the root directories of the database and the output
# Modify these based on your own location of the CAMELS dataset
# Following the data download instructions in the README file, you should organize the folders like
# 'your/path/to/Camels/basin_timeseries_v1p2_metForcing_obsFlow' and 'your/path/to/Camels/camels_attributes_v2.0'
# Then 'rootDatabase' here should be 'your/path/to/Camels'
# You can also define the database directory in hydroDL/__init__.py by modifying the pathCamels['DB'] variable
rootDatabase = os.path.join(os.path.sep, 'scratch', 'Camels') # CAMELS dataset root directory
camels.initcamels(rootDatabase) # initialize three camels module-scope variables in camels.py: dirDB, gageDict, statDict
rootOut = os.path.join(os.path.sep, 'data', 'rnnStreamflow') # Model output root directory
# define random seed
# seedid = [159654, 109958, 257886, 142365, 229837, 588859] # six seeds randomly generated using np.random.uniform
seedid = 159654
random.seed(seedid)
torch.manual_seed(seedid)
np.random.seed(seedid)
torch.cuda.manual_seed(seedid)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# The seed is fixed for reproducible training; change it to get different runs with different seeds
# We use the mean discharge of 6 runs with different seeds to account for randomness
# directory to save training results
exp_name='PUR'
exp_disp='Testrun'
save_path = os.path.join(exp_name, exp_disp, str(seedid))
# Divide CAMELS dataset into 7 PUR regions
gageinfo = camels.gageDict
hucinfo = gageinfo['huc']
gageid = gageinfo['id']
# get the id list of each region
regionID = list()
regionNum = list()
regionDivide = [[1, 2], [3, 6], [4, 5, 7], [9, 10], [8, 11, 12, 13], [14, 15, 16, 18], [17]]  # seven regions
for ii in range(len(regionDivide)):
    tempcomb = regionDivide[ii]
    tempregid = list()
    for ih in tempcomb:
        tempid = gageid[hucinfo == ih].tolist()
        tempregid = tempregid + tempid
    regionID.append(tempregid)
    regionNum.append(len(tempregid))
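# As an illustration of the result: regionID[0] holds the gauge ids of all basins
# whose HUC-02 code is 1 or 2, and regionNum[0] is how many basins that region contains.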
# Only for interfaceOpt=0 using multiple GPUs, not used here
# cid = 0 # starting GPU id
# gnum = 6 # how many GPUs you have
# Region withheld as the testing target. Take region 1 as an example.
# Change this to 1, 2, ..., 7 to run models for all 7 PUR regions in CONUS.
testRegion = 1
iexp = testRegion - 1  # index
TestLS = regionID[iexp]  # basin ID list for testing; withheld from training
TrainLS = list(set(gageid.tolist()) - set(TestLS))  # basin IDs for training
gageDic = {'TrainID': TrainLS, 'TestID': TestLS}
# prepare the training dataset
optData = default.optDataCamels
optData = default.update(optData, tRange=Ttrain, subset=TrainLS, lckernel=None, fdcopt=False)
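# lckernel sets the time range used to build the FDC ("local calibration") kernel and
# fdcopt toggles whether FDCs are fed to the model; both stay off for the base LSTM
# here and are switched on in Action 2 below.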
climateList = camels.attrLstSel + ['p_mean', 'pet_mean', 'p_seasonality', 'frac_snow', 'aridity', 'high_prec_freq',
                                   'high_prec_dur', 'low_prec_freq', 'low_prec_dur']
# climateList = ['slope_mean', 'area_gages2', 'frac_forest', 'soil_porosity', 'max_water_content']
# climateList = []
optData = default.update(optData, varT=camels.forcingLst, varC=climateList)
# varT: forcings used for training; varC: attributes used for training
# The above controls which attributes are used for training; change varC for the input-selection ensemble
# for the 5-attribute model: climateList = ['slope_mean', 'area_gages2', 'frac_forest', 'soil_porosity', 'max_water_content']
# for the no-attribute model: varC = []
# the input-selection ensemble takes the mean prediction of the full-, 5- and no-attribute models,
# in total the mean of 3 (different attribute sets) * 6 (different random seeds) = 18 models
if interfaceOpt == 1:
    # read data from the CAMELS dataset
    df = camels.DataframeCamels(
        subset=optData['subset'], tRange=optData['tRange'])
    x = df.getDataTs(
        varLst=optData['varT'],
        doNorm=False,
        rmNan=False)
    y = df.getDataObs(
        doNorm=False,
        rmNan=False,
        basinnorm=True)
    # "basinnorm = True" calls camels.basinNorm() on the original discharge data, transforming discharge
    # from ft3/s to mm/day and then dividing by mean precipitation to make it dimensionless.
    # output = discharge / (area * mean_precip)
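    # A rough sketch of that transformation (see camels.basinNorm for the exact
    # code; the unit constants below are this comment's assumption, not the source):
    #   q_mmday = q_cfs * 0.0283168 (m3/ft3) * 86400 (s/day) / area_m2 * 1000 (mm/m)
    #   runoff  = q_mmday / p_mean_mmday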
    c = df.getDataConst(
        varLst=optData['varC'],
        doNorm=False,
        rmNan=False)
    # process: normalize the data and remove NaNs
    series_data = np.concatenate([x, y], axis=2)
    seriesvarLst = camels.forcingLst + ['runoff']
    # calculate statistics for normalization and save them to a dictionary
    statDict = camels.getStatDic(attrLst=climateList, attrdata=c, seriesLst=seriesvarLst, seriesdata=series_data)
    # normalize
    attr_norm = camels.transNormbyDic(c, climateList, statDict, toNorm=True)
    attr_norm[np.isnan(attr_norm)] = 0.0
    series_norm = camels.transNormbyDic(series_data, seriesvarLst, statDict, toNorm=True)
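    # Conceptually the normalization is a per-variable standard score,
    #   norm = (x - mean) / std,
    # with statistics taken from statDict; the exact stat layout (and any
    # log-style transform applied to flow or precipitation) lives in camels.py.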
    # prepare the inputs
    xTrain = series_norm[:, :, :-1]  # forcings, excluding obs
    xTrain[np.isnan(xTrain)] = 0.0
    yTrain = np.expand_dims(series_norm[:, :, -1], 2)
    if attr_norm.size == 0:  # [], the no-attribute case
        attrs = None
        Nx = xTrain.shape[-1]
    else:
        # with attributes
        attrs = attr_norm
        Nx = xTrain.shape[-1] + attrs.shape[-1]
    Ny = yTrain.shape[-1]
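    # Shapes at this point (Nbasin = number of training basins, Nt = training days):
    #   xTrain: [Nbasin, Nt, len(forcingLst)]; yTrain: [Nbasin, Nt, 1];
    #   attrs:  [Nbasin, len(climateList)] or None; Nx counts forcings + attributes.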
# define loss function
optLoss = default.optLossRMSE
lossFun = crit.RmseLoss()
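# crit.RmseLoss computes roughly sqrt(mean((sim - obs)^2)) over the non-NaN target
# entries of each mini-batch (a rough summary; see hydroDL/model/crit.py for the
# exact masking and reduction).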
# configuration for training
optTrain = default.update(default.optTrainCamels, miniBatch=[BATCH_SIZE, RHO], nEpoch=EPOCH, saveEpoch=saveEPOCH, seed=seedid)
hucdic = 'Reg-'+str(iexp+1)+'-Num'+str(regionNum[iexp])
if 1 in Action:
    # Train the base LSTM PUR model
    out = os.path.join(rootOut, save_path, hucdic, 'Reg-85-95-Sub-Full')
    # out = os.path.join(rootOut, save_path, hucdic, 'Reg-85-95-Sub-5attr')
    # out = os.path.join(rootOut, save_path, hucdic, 'Reg-85-95-Sub-Noattr')
    if not os.path.isdir(out):
        os.makedirs(out)
    # log the training gage information
    gageFile = os.path.join(out, 'gage.json')
    with open(gageFile, 'w') as fp:
        json.dump(gageDic, fp, indent=4)
    # define the model config
    optModel = default.update(default.optLstm, name='hydroDL.model.rnn.CudnnLstmModel', hiddenSize=HIDDENSIZE)
    if interfaceOpt == 1:
        # define, load and train the model
        optModel = default.update(optModel, nx=Nx, ny=Ny)
        model = rnn.CudnnLstmModel(nx=optModel['nx'], ny=optModel['ny'], hiddenSize=optModel['hiddenSize'])
        # Wrap all the training configurations into one dictionary and save it to the "out" folder
        masterDict = master.wrapMaster(out, optData, optModel, optLoss, optTrain)
        master.writeMasterFile(masterDict)
        # log statistics
        statFile = os.path.join(out, 'statDict.json')
        with open(statFile, 'w') as fp:
            json.dump(statDict, fp, indent=4)
        # Train the model
        trainedModel = train.trainModel(
            model,
            xTrain,
            yTrain,
            attrs,
            lossFun,
            nEpoch=EPOCH,
            miniBatch=[BATCH_SIZE, RHO],
            saveEpoch=saveEPOCH,
            saveFolder=out)
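        # train.trainModel samples, at each iteration, a random batch of
        # BATCH_SIZE basins and a random RHO-day window from each basin, so an
        # epoch sees many short sequences rather than full records (a rough
        # description of the sampler; see hydroDL/model/train.py).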
    if interfaceOpt == 0:
        # Only need to pass the wrapped configuration dict 'masterDict' for training
        # nx and ny will be updated automatically later
        masterDict = master.wrapMaster(out, optData, optModel, optLoss, optTrain)
        master.train(masterDict)
        ## Not used here.
        ## A potential way to run batch jobs simultaneously in the background through multiple GPUs and Linux screens.
        ## To use this, you must manually set "pathCamels['DB']" in hydroDL/__init__.py to your own root path of the CAMELS data.
        ## Use the following master.runTrain() instead of the above master.train().
        # master.runTrain(masterDict, cudaID=cid % gnum, screen='test-' + str(cid))
        # cid = cid + 1
if 2 in Action:
    # Train the CNN-LSTM PUR model to integrate FDCs
    # LCrange defines from which period to derive the synthetic FDCs
    LCTstr = str(LCrange[0]) + '-' + str(LCrange[1])
    out = os.path.join(rootOut, save_path, hucdic, 'Reg-85-95-Sub-Full-FDC' + LCTstr)
    # out = os.path.join(rootOut, save_path, hucdic, 'Reg-85-95-Sub-5attr-FDC' + LCTstr)
    # out = os.path.join(rootOut, save_path, hucdic, 'Reg-85-95-Sub-Noattr-FDC' + LCTstr)
    if not os.path.isdir(out):
        os.makedirs(out)
    gageFile = os.path.join(out, 'gage.json')
    with open(gageFile, 'w') as fp:
        json.dump(gageDic, fp, indent=4)
    optData = default.update(default.optDataCamels, tRange=Ttrain, subset=TrainLS,
                             lckernel=LCrange, fdcopt=True)
    # define the model
    convNKS = [(10, 5, 1), (5, 3, 3), (1, 1, 1)]
    # CNN parameters for 3 layers: numbers of kernels (10, 5, 1), kernel sizes (5, 3, 3), strides (1, 1, 1)
    optModel = default.update(default.optCnn1dLstm, name='hydroDL.model.rnn.CNN1dLCmodel',
                              hiddenSize=HIDDENSIZE, convNKS=convNKS, poolOpt=[2, 2, 1])  # use the CNN-LSTM model
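    # In CNN1dLCmodel the synthetic FDC is passed through the three 1-D conv
    # layers above (poolOpt gives the pooling size after each layer), and the
    # resulting embedding is fed to the LSTM together with the forcings and
    # attributes (a hedged summary of hydroDL.model.rnn.CNN1dLCmodel).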
    if interfaceOpt == 1:
        # load the data and create synthetic FDCs as inputs
        dffdc = camels.DataframeCamels(subset=optData['subset'], tRange=optData['lckernel'])
        datatemp = dffdc.getDataObs(
            doNorm=False, rmNan=False, basinnorm=True)
        # normalize the data
        dadata = camels.transNormbyDic(datatemp, 'runoff', statDict, toNorm=True)
        dadata = np.squeeze(dadata)  # dim: Nbasin*Nday
        fdcdata = master.master.calFDC(dadata)
        print('FDC was calculated and used!')
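        # calFDC essentially sorts each basin's normalized flow in descending
        # order and samples the sorted curve at a fixed set of exceedance
        # quantiles, giving one FDC vector per basin (a sketch of the routine;
        # see hydroDL.master for the exact number of sampling points).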
        xIn = (xTrain, fdcdata)
        # instantiate the model
        Nobs = xIn[1].shape[-1]
        optModel = default.update(optModel, nx=Nx, ny=Ny, nobs=Nobs)  # update the input dims
        convpara = optModel['convNKS']
        model = rnn.CNN1dLCmodel(
            nx=optModel['nx'],
            ny=optModel['ny'],
            nobs=optModel['nobs'],
            hiddenSize=optModel['hiddenSize'],
            nkernel=convpara[0],
            kernelSize=convpara[1],
            stride=convpara[2],
            poolOpt=optModel['poolOpt'])
        print('CNN1d Local calibration Kernel is used!')
        # Wrap all the training configurations into one dictionary and save it to the "out" folder
        masterDict = master.wrapMaster(out, optData, optModel, optLoss, optTrain)
        master.writeMasterFile(masterDict)
        # log statistics
        statFile = os.path.join(out, 'statDict.json')
        with open(statFile, 'w') as fp:
            json.dump(statDict, fp, indent=4)
        # Train the model
        trainedModel = train.trainModel(
            model,
            xIn,  # tuple input: (time series forcings, FDCs)
            yTrain,
            attrs,
            lossFun,
            nEpoch=EPOCH,
            miniBatch=[BATCH_SIZE, RHO],
            saveEpoch=saveEPOCH,
            saveFolder=out)
    if interfaceOpt == 0:
        # Only need to pass the wrapped configuration 'masterDict' for training
        # nx, ny and nobs will be updated automatically later
        masterDict = master.wrapMaster(out, optData, optModel, optLoss, optTrain)
        master.train(masterDict)  # train the model
        # master.runTrain(masterDict, cudaID=cid % gnum, screen='test-' + str(cid))
        # cid = cid + 1