
Commit d5b19a8

Fix read json
thodson-usgs committed Jan 9, 2024
1 parent 3f164e8 commit d5b19a8
Showing 1 changed file with 74 additions and 52 deletions.
dataretrieval/nwis.py: 126 changes (74 additions, 52 deletions)
@@ -1306,60 +1306,82 @@ def _read_json(json):
     """
     merged_df = pd.DataFrame(columns=['site_no', 'datetime'])

-    for timeseries in json['value']['timeSeries']:
-        site_no = timeseries['sourceInfo']['siteCode'][0]['value']
-        param_cd = timeseries['variable']['variableCode'][0]['value']
-        # check whether min, max, mean record XXX
-        option = timeseries['variable']['options']['option'][0].get('value')
-
-        # loop through each parameter in timeseries.
-        for parameter in timeseries['values']:
-            col_name = param_cd
-            method = parameter['method'][0]['methodDescription']
-
-            # if len(timeseries['values']) > 1 and method:
-            if method:
-                # get method, format it, and append to column name
-                method = method.strip('[]()').lower()
-                col_name = f'{col_name}_{method}'
-
-            if option:
-                col_name = f'{col_name}_{option}'
-
-            record_json = parameter['value']
-
-            if not record_json:
-                # no data in record
-                continue
-            # should be able to avoid this by dumping
-            record_json = str(record_json).replace("'", '"')
-
-            # read json, converting all values to float64 and all qualifiers
-            # Lists can't be hashed, thus we cannot df.merge on a list column
-            record_df = pd.read_json(
-                StringIO(record_json),
-                orient='records',
-                dtype={'value': 'float64', 'qualifiers': 'unicode'},
-                convert_dates=False,
-            )
+    site_list = [
+        ts['sourceInfo']['siteCode'][0]['value'] for ts in json['value']['timeSeries']
+    ]

-            record_df['qualifiers'] = (
-                record_df['qualifiers'].str.strip('[]').str.replace("'", '')
-            )
-            record_df['site_no'] = site_no
-
-            record_df.rename(
-                columns={
-                    'value': col_name,
-                    'dateTime': 'datetime',
-                    'qualifiers': col_name + '_cd',
-                },
-                inplace=True,
-            )
+    # create a list of indexes for each change in site no
+    # for example, [0, 21, 22] would be the first and last indices
+    index_list = [0]
+    index_list.extend(
+        [i + 1 for i, (a, b) in enumerate(zip(site_list[:-1], site_list[1:])) if a != b]
+    )
+    index_list.append(len(site_list))
+
+    for i in range(len(index_list) - 1):
+        start = index_list[i]  # [0]
+        end = index_list[i + 1]  # [21]
+
+        # grab a block containing timeseries 0:21,
+        # which are all from the same site
+        site_block = json['value']['timeSeries'][start:end]
+        site_no = site_block[0]['sourceInfo']['siteCode'][0]['value']
+        site_df = pd.DataFrame(columns=['datetime'])
+
+        for timeseries in site_block:
+            param_cd = timeseries['variable']['variableCode'][0]['value']
+            # check whether min, max, mean record XXX
+            option = timeseries['variable']['options']['option'][0].get('value')
+
+            # loop through each parameter in timeseries, then concat to the merged_df
+            for parameter in timeseries['values']:
+                col_name = param_cd
+                method = parameter['method'][0]['methodDescription']
+
+                # if len(timeseries['values']) > 1 and method:
+                if method:
+                    # get method, format it, and append to column name
+                    method = method.strip('[]()').lower()
+                    col_name = f'{col_name}_{method}'
+
+                if option:
+                    col_name = f'{col_name}_{option}'
+
+                record_json = parameter['value']
+
+                if not record_json:
+                    # no data in record
+                    continue
+                # should be able to avoid this by dumping
+                record_json = str(record_json).replace("'", '"')
+
+                # read json, converting all values to float64 and all qualifiers
+                # Lists can't be hashed, thus we cannot df.merge on a list column
+                record_df = pd.read_json(
+                    StringIO(record_json),
+                    orient='records',
+                    dtype={'value': 'float64', 'qualifiers': 'unicode'},
+                    convert_dates=False,
+                )

-            merged_df = merged_df.merge(
-                record_df, how='outer', on=['site_no', 'datetime']
-            )
+                record_df['qualifiers'] = (
+                    record_df['qualifiers'].str.strip('[]').str.replace("'", '')
+                )
+
+                record_df.rename(
+                    columns={
+                        'value': col_name,
+                        'dateTime': 'datetime',
+                        'qualifiers': col_name + '_cd',
+                    },
+                    inplace=True,
+                )
+
+                site_df = site_df.merge(record_df, how='outer', on='datetime')
+
+        # end of site loop
+        site_df['site_no'] = site_no
+        merged_df = pd.concat([merged_df, site_df])

     # convert to datetime, normalizing the timezone to UTC when doing so
     if 'datetime' in merged_df.columns:
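The heart of the change is the grouping step, which assumes the timeSeries list returned by NWIS is ordered so that all records for a given site are consecutive. A minimal sketch of the boundary computation, using a made-up site_list (the site numbers are hypothetical):

    site_list = ['01646500', '01646500', '09380000']  # hypothetical sites

    # boundaries fall wherever the site number changes between neighbors
    index_list = [0]
    index_list.extend(
        [i + 1 for i, (a, b) in enumerate(zip(site_list[:-1], site_list[1:])) if a != b]
    )
    index_list.append(len(site_list))

    print(index_list)  # [0, 2, 3]: slice 0:2 is the first site, 2:3 the second

Each adjacent pair of boundaries then yields one site_block slice, so every block handed to the inner loops contains timeseries from exactly one site.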

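The "# should be able to avoid this by dumping" comment refers to serializing the record list with the standard-library json module instead of swapping quote characters by hand. A sketch of that alternative, assuming record_json has the usual NWIS 'values' shape (the sample record below is made up); note that inside _read_json the module would need an import alias, since `json` is the function's argument name:

    import json
    from io import StringIO

    import pandas as pd

    record_json = [  # made-up record in the NWIS 'values' shape
        {'value': '5.0', 'qualifiers': ['P'], 'dateTime': '2024-01-09T00:00:00.000'}
    ]
    record_str = json.dumps(record_json)  # valid JSON; no .replace("'", '"') needed

    record_df = pd.read_json(
        StringIO(record_str),
        orient='records',
        dtype={'value': 'float64', 'qualifiers': 'unicode'},
        convert_dates=False,
    )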

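Merging each parameter onto site_df by 'datetime' alone and then concatenating finished site blocks replaces the old outer merge on ['site_no', 'datetime'] across every record. A toy illustration of the new flow, with made-up values for two parameter columns ('00060' for discharge, '00065' for gage height) and a hypothetical site number:

    import pandas as pd

    flow = pd.DataFrame({'datetime': ['2024-01-09', '2024-01-10'], '00060': [5.0, 6.1]})
    stage = pd.DataFrame({'datetime': ['2024-01-09', '2024-01-10'], '00065': [1.2, 1.3]})

    # within one site, align parameters on datetime alone
    site_df = pd.DataFrame(columns=['datetime'])
    site_df = site_df.merge(flow, how='outer', on='datetime')
    site_df = site_df.merge(stage, how='outer', on='datetime')
    site_df['site_no'] = '01646500'  # hypothetical site

    # finished site blocks are appended, not merged, into the result
    merged_df = pd.concat([pd.DataFrame(columns=['site_no', 'datetime']), site_df])
    # one row per datetime, parameter columns side by side, site_no stamped per block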