From 7fd89e8025ad8196aceefea4017fd216cf7fedcd Mon Sep 17 00:00:00 2001
From: Tobias de Jong <t.a.de.jong@sron.nl>
Date: Tue, 2 Jul 2024 10:47:34 +0200
Subject: [PATCH 1/6] Use combine_nested. Not tested.

---
 monetio/models/hysplit.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py
index f2199ac9..e77749c1 100644
--- a/monetio/models/hysplit.py
+++ b/monetio/models/hysplit.py
@@ -563,6 +563,7 @@ class and need to be converted to a python
         # Safety valve - will not allow more than 1000 loops to be executed.
         imax = 1e8
         testf = True
+        timedslist = []
         while testf:
             hdata6 = np.fromfile(fid, dtype=rec6, count=1)
             hdata7 = np.fromfile(fid, dtype=rec6, count=1)
@@ -575,8 +576,10 @@ class and need to be converted to a python
             # datelist = []
             inc_iii = False
             # LOOP to go through each level
+            lvldslist = []
             for _ in range(self.atthash["Number of Levels"]):
                 # LOOP to go through each pollutant
+                poldslist = []
                 for _ in range(self.atthash["Number of Species"]):
                     # record 8a has the number of elements (ne). If number of
                     # elements greater than 0 than there are concentrations.
@@ -613,17 +616,15 @@ class and need to be converted to a python
                         #    print("Adding ", "Pollutant", pollutant, "Level", lev)
                         # if this is the first time through. create dataframe
                         # for first level and pollutant.
-                        if not self.dset.any():
-                            self.dset = dset
-                        else:  # create dataframe for level and pollutant and
+                        poldslist += [dset]
+                        #if not self.dset.any():
+                            #self.dset = dset
+                        #else:  # create dataframe for level and pollutant and
                             # then merge with main dataframe.
-                            # self.dset = xr.concat([self.dset, dset],'levels')
-                            # self.dset = xr.merge([self.dset, dset],compat='override')
-                            self.dset = xr.merge([self.dset, dset])
-                            # self.dset = xr.combine_by_coords([self.dset, dset])
-                            # self.dset = xr.merge([self.dset, dset], compat='override')
+                            # self.dset = xr.merge([self.dset, dset])
                         iimax += 1
                 # END LOOP to go through each pollutant
+                lvldslist += [poldslist]
             # END LOOP to go through each level
             # safety check - will stop sampling time while loop if goes over
             #  imax iterations.
@@ -632,11 +633,16 @@ class and need to be converted to a python
                 print("greater than imax", testf, iimax, imax)
             if inc_iii:
                 iii += 1
+            timedslist += [lvldslist]
 
         self.atthash.update(self.gridhash)
         self.atthash["Species ID"] = list(set(self.atthash["Species ID"]))
         self.atthash["Coordinate time description"] = "Beginning of sampling time"
         # END OF Loop to go through each sampling time
+        #print(dsets[0], dsets[1], dsets[-1], '\n\n\n')
+        self.dset = xr.combine_nested(timedslist, concat_dim=['time', 'z', None])
+        print(self.dset)
+        #self.dset = xr.merge(dsets, compat='override')
         if not self.dset.any():
             return False
         if self.dset.variables:

From f21545efad350aa799c0e7bce49c4e686f8c9472 Mon Sep 17 00:00:00 2001
From: Tobias de Jong <t.a.de.jong@sron.nl>
Date: Tue, 2 Jul 2024 13:24:55 +0200
Subject: [PATCH 2/6] Update to work with multiple tracers. Speed is 15x.

Tested against output of hysplit.v5.2.2
---
 monetio/models/hysplit.py | 34 ++++++++++++++--------------------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py
index e77749c1..027e3f3f 100644
--- a/monetio/models/hysplit.py
+++ b/monetio/models/hysplit.py
@@ -576,11 +576,11 @@ class and need to be converted to a python
             # datelist = []
             inc_iii = False
             # LOOP to go through each level
-            lvldslist = []
-            for _ in range(self.atthash["Number of Levels"]):
+            poldslist = []
+            for _ in range(self.atthash["Number of Species"]):
                 # LOOP to go through each pollutant
-                poldslist = []
-                for _ in range(self.atthash["Number of Species"]):
+                lvldslist = []
+                for _ in range(self.atthash["Number of Levels"]):
                     # record 8a has the number of elements (ne). If number of
                     # elements greater than 0 than there are concentrations.
                     hdata8a = np.fromfile(fid, dtype=rec8a, count=1)
@@ -614,17 +614,11 @@ class and need to be converted to a python
                         dset = xr.Dataset.from_dataframe(concframe)
                         # if verbose:
                         #    print("Adding ", "Pollutant", pollutant, "Level", lev)
-                        # if this is the first time through. create dataframe
-                        # for first level and pollutant.
-                        poldslist += [dset]
-                        #if not self.dset.any():
-                            #self.dset = dset
-                        #else:  # create dataframe for level and pollutant and
-                            # then merge with main dataframe.
-                            # self.dset = xr.merge([self.dset, dset])
+                        lvldslist += [dset]
                         iimax += 1
                 # END LOOP to go through each pollutant
-                lvldslist += [poldslist]
+                #lvldslist = xr.concat(lvldslist, 'z')
+                poldslist += [lvldslist]
             # END LOOP to go through each level
             # safety check - will stop sampling time while loop if goes over
             #  imax iterations.
@@ -633,16 +627,16 @@ class and need to be converted to a python
                 print("greater than imax", testf, iimax, imax)
             if inc_iii:
                 iii += 1
-            timedslist += [lvldslist]
-
+            timedslist += [poldslist]
+        # END OF Loop to go through each sampling time
         self.atthash.update(self.gridhash)
         self.atthash["Species ID"] = list(set(self.atthash["Species ID"]))
         self.atthash["Coordinate time description"] = "Beginning of sampling time"
-        # END OF Loop to go through each sampling time
-        #print(dsets[0], dsets[1], dsets[-1], '\n\n\n')
-        self.dset = xr.combine_nested(timedslist, concat_dim=['time', 'z', None])
-        print(self.dset)
-        #self.dset = xr.merge(dsets, compat='override')
+
+        Ns = range(self.atthash["Number of Species"])
+        dsets = [[ll[n] for ll in timedslist] for n in Ns]
+        dsets = [xr.combine_nested(ds, concat_dim=['time', 'z']) for ds in dsets]
+        self.dset = xr.merge(dsets)
         if not self.dset.any():
             return False
         if self.dset.variables:

From 8975252f3f63df99669228953a8b904a0b4e5a30 Mon Sep 17 00:00:00 2001
From: Tobias de Jong <t.a.de.jong@sron.nl>
Date: Tue, 2 Jul 2024 15:56:00 +0200
Subject: [PATCH 3/6] Properly handle empty columns

---
 monetio/models/hysplit.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py
index 027e3f3f..ee5341bb 100644
--- a/monetio/models/hysplit.py
+++ b/monetio/models/hysplit.py
@@ -617,8 +617,9 @@ class and need to be converted to a python
                         lvldslist += [dset]
                         iimax += 1
                 # END LOOP to go through each pollutant
-                #lvldslist = xr.concat(lvldslist, 'z')
-                poldslist += [lvldslist]
+                if len(lvldslist) > 0:
+                    lvldslist = xr.concat(lvldslist, 'z')
+                    poldslist += [lvldslist]
             # END LOOP to go through each level
             # safety check - will stop sampling time while loop if goes over
             #  imax iterations.
@@ -627,7 +628,8 @@ class and need to be converted to a python
                 print("greater than imax", testf, iimax, imax)
             if inc_iii:
                 iii += 1
-            timedslist += [poldslist]
+            if len(poldslist) > 0:
+                timedslist += [poldslist]
         # END OF Loop to go through each sampling time
         self.atthash.update(self.gridhash)
         self.atthash["Species ID"] = list(set(self.atthash["Species ID"]))
@@ -635,7 +637,8 @@ class and need to be converted to a python
 
         Ns = range(self.atthash["Number of Species"])
         dsets = [[ll[n] for ll in timedslist] for n in Ns]
-        dsets = [xr.combine_nested(ds, concat_dim=['time', 'z']) for ds in dsets]
+        #dsets = [xr.combine_nested(ds, concat_dim=['time', 'z']) for ds in dsets]
+        dsets = [xr.concat(ds, dim='time') for ds in dsets]
         self.dset = xr.merge(dsets)
         if not self.dset.any():
             return False

From f10dbe760cfc6b5b41a133e2392bc3ac5c7686f4 Mon Sep 17 00:00:00 2001
From: Tobias de Jong <t.a.de.jong@sron.nl>
Date: Wed, 17 Jul 2024 12:50:43 +0200
Subject: [PATCH 4/6] Handle the case of no particles of a tracer in some
 timesteps

---
 monetio/models/hysplit.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py
index ee5341bb..fb859486 100644
--- a/monetio/models/hysplit.py
+++ b/monetio/models/hysplit.py
@@ -620,6 +620,8 @@ class and need to be converted to a python
                 if len(lvldslist) > 0:
                     lvldslist = xr.concat(lvldslist, 'z')
                     poldslist += [lvldslist]
+                else:
+                    poldslist += [None]
             # END LOOP to go through each level
             # safety check - will stop sampling time while loop if goes over
             #  imax iterations.
@@ -636,8 +638,8 @@ class and need to be converted to a python
         self.atthash["Coordinate time description"] = "Beginning of sampling time"
 
         Ns = range(self.atthash["Number of Species"])
-        dsets = [[ll[n] for ll in timedslist] for n in Ns]
-        #dsets = [xr.combine_nested(ds, concat_dim=['time', 'z']) for ds in dsets]
+        # Grab per species all relevant datasets in the time list
+        dsets = [[ll[n] for ll in timedslist if ll[n] is not None] for n in Ns]
         dsets = [xr.concat(ds, dim='time') for ds in dsets]
         self.dset = xr.merge(dsets)
         if not self.dset.any():

From 7fd9f08250cd16f4885d234ec8bbf55c5a637abe Mon Sep 17 00:00:00 2001
From: Tobias de Jong <t.a.de.jong@sron.nl>
Date: Wed, 17 Jul 2024 14:28:07 +0200
Subject: [PATCH 5/6] Organise and further speedup

---
 monetio/models/hysplit.py | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py
index fb859486..5711a23c 100644
--- a/monetio/models/hysplit.py
+++ b/monetio/models/hysplit.py
@@ -455,12 +455,13 @@ def parse_hdata8(hdata8a, hdata8b, pdate1):
         names = ["y" if x == "jndx" else x for x in names]
         names = ["x" if x == "indx" else x for x in names]
         names = ["z" if x == "levels" else x for x in names]
+        names = [col_name if x == "conc" else x for x in names]
         concframe.columns = names
-        concframe.set_index(
-            ["time", "z", "y", "x"],
-            inplace=True,
-        )
-        concframe.rename(columns={"conc": col_name}, inplace=True)
+        # concframe.set_index(
+        #     ["time", "z", "y", "x"],
+        #     inplace=True,
+        # )
+        #concframe.rename(columns={"conc": col_name}, inplace=True)
         # mgrid = np.meshgrid(lat, lon)
         return concframe
 
@@ -575,11 +576,11 @@ class and need to be converted to a python
                 print("sample time", pdate1, " to ", pdate2)
             # datelist = []
             inc_iii = False
-            # LOOP to go through each level
+            # LOOP to go through each pollutant
             poldslist = []
             for _ in range(self.atthash["Number of Species"]):
-                # LOOP to go through each pollutant
-                lvldslist = []
+                # LOOP to go through each level
+                concframes = []
                 for _ in range(self.atthash["Number of Levels"]):
                     # record 8a has the number of elements (ne). If number of
                     # elements greater than 0 than there are concentrations.
@@ -611,18 +612,22 @@ class and need to be converted to a python
                             concframe = self.parse_hdata8(hdata8a, hdata8b, pdate2)
                         else:
                             concframe = self.parse_hdata8(hdata8a, hdata8b, pdate1)
-                        dset = xr.Dataset.from_dataframe(concframe)
+                        concframes += [concframe]
                         # if verbose:
                         #    print("Adding ", "Pollutant", pollutant, "Level", lev)
-                        lvldslist += [dset]
                         iimax += 1
-                # END LOOP to go through each pollutant
-                if len(lvldslist) > 0:
-                    lvldslist = xr.concat(lvldslist, 'z')
-                    poldslist += [lvldslist]
+                # END LOOP to go through each level
+                if len(concframes) > 0:
+                    concframes = pd.concat(concframes)
+                    concframes.set_index(
+                                    ["time", "z", "y", "x"],
+                                    inplace=True,
+                    )
+                    dset = xr.Dataset.from_dataframe(concframes)
+                    poldslist += [dset]
                 else:
                     poldslist += [None]
-            # END LOOP to go through each level
+            # END LOOP to go through each pollutant
             # safety check - will stop sampling time while loop if goes over
             #  imax iterations.
             if iimax > imax:

From 25d725c28ac211d178eceefc54dc99effca390a3 Mon Sep 17 00:00:00 2001
From: Tobias de Jong <t.a.de.jong@sron.nl>
Date: Thu, 1 Aug 2024 15:02:10 +0200
Subject: [PATCH 6/6] Handle tracers fully outside the grid

---
 monetio/models/hysplit.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/monetio/models/hysplit.py b/monetio/models/hysplit.py
index 5711a23c..fd57edd2 100644
--- a/monetio/models/hysplit.py
+++ b/monetio/models/hysplit.py
@@ -645,10 +645,12 @@ class and need to be converted to a python
         Ns = range(self.atthash["Number of Species"])
         # Grab per species all relevant datasets in the time list
         dsets = [[ll[n] for ll in timedslist if ll[n] is not None] for n in Ns]
-        dsets = [xr.concat(ds, dim='time') for ds in dsets]
-        self.dset = xr.merge(dsets)
-        if not self.dset.any():
+        dsets = [xr.concat(ds, dim='time') for ds in dsets if len(ds) > 0]
+        if len(dsets) == 0:
             return False
+        self.dset = xr.merge(dsets)
+        # if not self.dset.any():
+        #     return False
         if self.dset.variables:
             self.dset.attrs = self.atthash
             # mgrid = self.makegrid(self.dset.coords["x"], self.dset.coords["y"])