From 7fd89e8025ad8196aceefea4017fd216cf7fedcd Mon Sep 17 00:00:00 2001 From: Tobias de Jong Date: Tue, 2 Jul 2024 10:47:34 +0200 Subject: [PATCH 1/6] Use combine_nested. Not tested. --- monetio/models/hysplit.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index f2199ac9..e77749c1 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -563,6 +563,7 @@ class and need to be converted to a python # Safety valve - will not allow more than 1000 loops to be executed. imax = 1e8 testf = True + timedslist = [] while testf: hdata6 = np.fromfile(fid, dtype=rec6, count=1) hdata7 = np.fromfile(fid, dtype=rec6, count=1) @@ -575,8 +576,10 @@ class and need to be converted to a python # datelist = [] inc_iii = False # LOOP to go through each level + lvldslist = [] for _ in range(self.atthash["Number of Levels"]): # LOOP to go through each pollutant + poldslist = [] for _ in range(self.atthash["Number of Species"]): # record 8a has the number of elements (ne). If number of # elements greater than 0 than there are concentrations. @@ -613,17 +616,15 @@ class and need to be converted to a python # print("Adding ", "Pollutant", pollutant, "Level", lev) # if this is the first time through. create dataframe # for first level and pollutant. - if not self.dset.any(): - self.dset = dset - else: # create dataframe for level and pollutant and + poldslist += [dset] + #if not self.dset.any(): + #self.dset = dset + #else: # create dataframe for level and pollutant and # then merge with main dataframe. - # self.dset = xr.concat([self.dset, dset],'levels') - # self.dset = xr.merge([self.dset, dset],compat='override') - self.dset = xr.merge([self.dset, dset]) - # self.dset = xr.combine_by_coords([self.dset, dset]) - # self.dset = xr.merge([self.dset, dset], compat='override') + # self.dset = xr.merge([self.dset, dset]) iimax += 1 # END LOOP to go through each pollutant + lvldslist += [poldslist] # END LOOP to go through each level # safety check - will stop sampling time while loop if goes over # imax iterations. @@ -632,11 +633,16 @@ class and need to be converted to a python print("greater than imax", testf, iimax, imax) if inc_iii: iii += 1 + timedslist += [lvldslist] self.atthash.update(self.gridhash) self.atthash["Species ID"] = list(set(self.atthash["Species ID"])) self.atthash["Coordinate time description"] = "Beginning of sampling time" # END OF Loop to go through each sampling time + #print(dsets[0], dsets[1], dsets[-1], '\n\n\n') + self.dset = xr.combine_nested(timedslist, concat_dim=['time', 'z', None]) + print(self.dset) + #self.dset = xr.merge(dsets, compat='override') if not self.dset.any(): return False if self.dset.variables: From f21545efad350aa799c0e7bce49c4e686f8c9472 Mon Sep 17 00:00:00 2001 From: Tobias de Jong Date: Tue, 2 Jul 2024 13:24:55 +0200 Subject: [PATCH 2/6] Update to work with multiple tracers. Speed is 15x. Tested against output of hysplit.v5.2.2 --- monetio/models/hysplit.py | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index e77749c1..027e3f3f 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -576,11 +576,11 @@ class and need to be converted to a python # datelist = [] inc_iii = False # LOOP to go through each level - lvldslist = [] - for _ in range(self.atthash["Number of Levels"]): + poldslist = [] + for _ in range(self.atthash["Number of Species"]): # LOOP to go through each pollutant - poldslist = [] - for _ in range(self.atthash["Number of Species"]): + lvldslist = [] + for _ in range(self.atthash["Number of Levels"]): # record 8a has the number of elements (ne). If number of # elements greater than 0 than there are concentrations. hdata8a = np.fromfile(fid, dtype=rec8a, count=1) @@ -614,17 +614,11 @@ class and need to be converted to a python dset = xr.Dataset.from_dataframe(concframe) # if verbose: # print("Adding ", "Pollutant", pollutant, "Level", lev) - # if this is the first time through. create dataframe - # for first level and pollutant. - poldslist += [dset] - #if not self.dset.any(): - #self.dset = dset - #else: # create dataframe for level and pollutant and - # then merge with main dataframe. - # self.dset = xr.merge([self.dset, dset]) + lvldslist += [dset] iimax += 1 # END LOOP to go through each pollutant - lvldslist += [poldslist] + #lvldslist = xr.concat(lvldslist, 'z') + poldslist += [lvldslist] # END LOOP to go through each level # safety check - will stop sampling time while loop if goes over # imax iterations. @@ -633,16 +627,16 @@ class and need to be converted to a python print("greater than imax", testf, iimax, imax) if inc_iii: iii += 1 - timedslist += [lvldslist] - + timedslist += [poldslist] + # END OF Loop to go through each sampling time self.atthash.update(self.gridhash) self.atthash["Species ID"] = list(set(self.atthash["Species ID"])) self.atthash["Coordinate time description"] = "Beginning of sampling time" - # END OF Loop to go through each sampling time - #print(dsets[0], dsets[1], dsets[-1], '\n\n\n') - self.dset = xr.combine_nested(timedslist, concat_dim=['time', 'z', None]) - print(self.dset) - #self.dset = xr.merge(dsets, compat='override') + + Ns = range(self.atthash["Number of Species"]) + dsets = [[ll[n] for ll in timedslist] for n in Ns] + dsets = [xr.combine_nested(ds, concat_dim=['time', 'z']) for ds in dsets] + self.dset = xr.merge(dsets) if not self.dset.any(): return False if self.dset.variables: From 8975252f3f63df99669228953a8b904a0b4e5a30 Mon Sep 17 00:00:00 2001 From: Tobias de Jong Date: Tue, 2 Jul 2024 15:56:00 +0200 Subject: [PATCH 3/6] Properly handle empty columns --- monetio/models/hysplit.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index 027e3f3f..ee5341bb 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -617,8 +617,9 @@ class and need to be converted to a python lvldslist += [dset] iimax += 1 # END LOOP to go through each pollutant - #lvldslist = xr.concat(lvldslist, 'z') - poldslist += [lvldslist] + if len(lvldslist) > 0: + lvldslist = xr.concat(lvldslist, 'z') + poldslist += [lvldslist] # END LOOP to go through each level # safety check - will stop sampling time while loop if goes over # imax iterations. @@ -627,7 +628,8 @@ class and need to be converted to a python print("greater than imax", testf, iimax, imax) if inc_iii: iii += 1 - timedslist += [poldslist] + if len(poldslist) > 0: + timedslist += [poldslist] # END OF Loop to go through each sampling time self.atthash.update(self.gridhash) self.atthash["Species ID"] = list(set(self.atthash["Species ID"])) @@ -635,7 +637,8 @@ class and need to be converted to a python Ns = range(self.atthash["Number of Species"]) dsets = [[ll[n] for ll in timedslist] for n in Ns] - dsets = [xr.combine_nested(ds, concat_dim=['time', 'z']) for ds in dsets] + #dsets = [xr.combine_nested(ds, concat_dim=['time', 'z']) for ds in dsets] + dsets = [xr.concat(ds, dim='time') for ds in dsets] self.dset = xr.merge(dsets) if not self.dset.any(): return False From f10dbe760cfc6b5b41a133e2392bc3ac5c7686f4 Mon Sep 17 00:00:00 2001 From: Tobias de Jong Date: Wed, 17 Jul 2024 12:50:43 +0200 Subject: [PATCH 4/6] Handle the case of no particles of a tracer in some timesteps --- monetio/models/hysplit.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index ee5341bb..fb859486 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -620,6 +620,8 @@ class and need to be converted to a python if len(lvldslist) > 0: lvldslist = xr.concat(lvldslist, 'z') poldslist += [lvldslist] + else: + poldslist += [None] # END LOOP to go through each level # safety check - will stop sampling time while loop if goes over # imax iterations. @@ -636,8 +638,8 @@ class and need to be converted to a python self.atthash["Coordinate time description"] = "Beginning of sampling time" Ns = range(self.atthash["Number of Species"]) - dsets = [[ll[n] for ll in timedslist] for n in Ns] - #dsets = [xr.combine_nested(ds, concat_dim=['time', 'z']) for ds in dsets] + # Grab per species all relevant datasets in the time list + dsets = [[ll[n] for ll in timedslist if ll[n] is not None] for n in Ns] dsets = [xr.concat(ds, dim='time') for ds in dsets] self.dset = xr.merge(dsets) if not self.dset.any(): From 7fd9f08250cd16f4885d234ec8bbf55c5a637abe Mon Sep 17 00:00:00 2001 From: Tobias de Jong Date: Wed, 17 Jul 2024 14:28:07 +0200 Subject: [PATCH 5/6] Organise and further speedup --- monetio/models/hysplit.py | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index fb859486..5711a23c 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -455,12 +455,13 @@ def parse_hdata8(hdata8a, hdata8b, pdate1): names = ["y" if x == "jndx" else x for x in names] names = ["x" if x == "indx" else x for x in names] names = ["z" if x == "levels" else x for x in names] + names = [col_name if x == "conc" else x for x in names] concframe.columns = names - concframe.set_index( - ["time", "z", "y", "x"], - inplace=True, - ) - concframe.rename(columns={"conc": col_name}, inplace=True) + # concframe.set_index( + # ["time", "z", "y", "x"], + # inplace=True, + # ) + #concframe.rename(columns={"conc": col_name}, inplace=True) # mgrid = np.meshgrid(lat, lon) return concframe @@ -575,11 +576,11 @@ class and need to be converted to a python print("sample time", pdate1, " to ", pdate2) # datelist = [] inc_iii = False - # LOOP to go through each level + # LOOP to go through each pollutant poldslist = [] for _ in range(self.atthash["Number of Species"]): - # LOOP to go through each pollutant - lvldslist = [] + # LOOP to go through each level + concframes = [] for _ in range(self.atthash["Number of Levels"]): # record 8a has the number of elements (ne). If number of # elements greater than 0 than there are concentrations. @@ -611,18 +612,22 @@ class and need to be converted to a python concframe = self.parse_hdata8(hdata8a, hdata8b, pdate2) else: concframe = self.parse_hdata8(hdata8a, hdata8b, pdate1) - dset = xr.Dataset.from_dataframe(concframe) + concframes += [concframe] # if verbose: # print("Adding ", "Pollutant", pollutant, "Level", lev) - lvldslist += [dset] iimax += 1 - # END LOOP to go through each pollutant - if len(lvldslist) > 0: - lvldslist = xr.concat(lvldslist, 'z') - poldslist += [lvldslist] + # END LOOP to go through each level + if len(concframes) > 0: + concframes = pd.concat(concframes) + concframes.set_index( + ["time", "z", "y", "x"], + inplace=True, + ) + dset = xr.Dataset.from_dataframe(concframes) + poldslist += [dset] else: poldslist += [None] - # END LOOP to go through each level + # END LOOP to go through each pollutant # safety check - will stop sampling time while loop if goes over # imax iterations. if iimax > imax: From 25d725c28ac211d178eceefc54dc99effca390a3 Mon Sep 17 00:00:00 2001 From: Tobias de Jong Date: Thu, 1 Aug 2024 15:02:10 +0200 Subject: [PATCH 6/6] Handle tracers fully outside the grid --- monetio/models/hysplit.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py index 5711a23c..fd57edd2 100644 --- a/monetio/models/hysplit.py +++ b/monetio/models/hysplit.py @@ -645,10 +645,12 @@ class and need to be converted to a python Ns = range(self.atthash["Number of Species"]) # Grab per species all relevant datasets in the time list dsets = [[ll[n] for ll in timedslist if ll[n] is not None] for n in Ns] - dsets = [xr.concat(ds, dim='time') for ds in dsets] - self.dset = xr.merge(dsets) - if not self.dset.any(): + dsets = [xr.concat(ds, dim='time') for ds in dsets if len(ds) > 0] + if len(dsets) == 0: return False + self.dset = xr.merge(dsets) + # if not self.dset.any(): + # return False if self.dset.variables: self.dset.attrs = self.atthash # mgrid = self.makegrid(self.dset.coords["x"], self.dset.coords["y"])