New function to combine feature dataframes #186

Merged 4 commits on Oct 21, 2022
11 changes: 11 additions & 0 deletions doc/big_datasets.rst
@@ -0,0 +1,11 @@
Handling Large Datasets
-------------------------------------

Users often want to use *tobac* to identify and track features in large datasets ("big data"). This documentation suggests strategies for doing so efficiently. Current versions of *tobac* do not support out-of-memory computation, so these strategies may be needed for both run-time and memory reasons.

.. _Split Feature Detection:
=======================
Split Feature Detection
=======================
Current versions of threshold feature detection (see :doc:`feature_detection_overview`) are time-independent, meaning that feature detection can be parallelized across all times (although not across space). *tobac* provides the :py:meth:`tobac.utils.combine_tobac_feats` function to combine a list of dataframes produced by a parallelization method (such as :code:`jug` or :code:`multiprocessing.pool`) into a single combined dataframe suitable for tracking.
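A minimal sketch of this workflow, assuming only pandas and the standard library: ``detect_chunk`` below is a hypothetical stand-in for running tobac feature detection on one time step, and threads are used instead of processes only to keep the sketch self-contained.

```python
from concurrent.futures import ThreadPoolExecutor

import pandas as pd


def detect_chunk(time_index):
    # Hypothetical stand-in for per-time-step feature detection; each
    # call numbers its features from 1 independently of the others.
    return pd.DataFrame({"frame": [0], "feature": [1], "time": [time_index]})


# Map detection over the time steps in parallel. A real workflow would
# use multiprocessing.Pool or a job manager such as jug instead.
with ThreadPoolExecutor(max_workers=4) as executor:
    list_of_feats = list(executor.map(detect_chunk, range(3)))

# The per-chunk feature numbers overlap, which is exactly what
# combining the list into one dataframe has to resolve:
print([int(df["feature"].iloc[0]) for df in list_of_feats])  # [1, 1, 1]
```

In real use, ``list_of_feats`` would then be passed to :py:meth:`tobac.utils.combine_tobac_feats` to obtain one consistently numbered dataframe.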

3 changes: 2 additions & 1 deletion doc/index.rst
@@ -23,7 +23,8 @@ The project is currently being extended by several contributors to include addit
   installation
   data_input
   analysis
   plotting
   plotting
   big_datasets
   examples
   publications

39 changes: 38 additions & 1 deletion tobac/tests/test_utils.py
@@ -1,5 +1,11 @@
import numpy as np
import datetime

import tobac.utils as tb_utils
import tobac.testing as tb_test

import pandas as pd
import pandas.testing as pd_test
from scipy import fft


@@ -55,3 +61,34 @@ def test_spectral_filtering():
)
>= 1
)


def test_combine_tobac_feats():
    """Tests tobac.utils.combine_tobac_feats.
    Generate two single-feature dataframes, combine them with this
    function, and check that the result matches a single combined
    dataframe.
    """

    single_feat_1 = tb_test.generate_single_feature(
        0, 0, start_date=datetime.datetime(2022, 1, 1, 0, 0), frame_start=0
    )
    single_feat_2 = tb_test.generate_single_feature(
        1, 1, start_date=datetime.datetime(2022, 1, 1, 0, 5), frame_start=0
    )

    combined_feat = tb_utils.combine_tobac_feats([single_feat_1, single_feat_2])

    tot_feat = tb_test.generate_single_feature(
        0, 0, spd_h1=1, spd_h2=1, num_frames=2, frame_start=0
    )

    pd_test.assert_frame_equal(combined_feat, tot_feat)

    # Now try preserving the old feature numbers.
    combined_feat = tb_utils.combine_tobac_feats(
        [single_feat_1, single_feat_2], preserve_old_feat_nums="old_feat_column"
    )
    assert np.all(combined_feat["old_feat_column"].values == [1, 1])
    assert np.all(combined_feat["feature"].values == [1, 2])
50 changes: 50 additions & 0 deletions tobac/utils.py
@@ -894,3 +894,53 @@ def spectral_filtering(
        return (lambda_mn, transfer_function), filtered_field
    else:
        return filtered_field


def combine_tobac_feats(list_of_feats, preserve_old_feat_nums=None):
    """Combine a list of tobac feature detection dataframes into one
    combined dataframe that can be used for tracking or segmentation.

    Parameters
    ----------
    list_of_feats: array-like of pandas DataFrames
        A list of dataframes (generated, for example, by
        running feature detection on multiple nodes).

    preserve_old_feat_nums: str or None
        The column name in which to preserve the old feature numbers.
        If None, the old numbers are discarded. Users may want to set
        this if they have already run segmentation on the separate
        dataframes and therefore need the old feature numbers.

    Returns
    -------
    pd.DataFrame
        One combined DataFrame.

    """
    import pandas as pd
    import numpy as np

    # First, concatenate the individual dataframes.
    combined_df = pd.concat(list_of_feats)
    # Then sort by time first, then by feature number.
    combined_df = combined_df.sort_values(["time", "feature"])
    all_times = sorted(combined_df["time"].unique())
    start_feat_num = combined_df["feature"].min()
    # Save the old feature numbers if requested.
    if preserve_old_feat_nums is not None:
        combined_df[preserve_old_feat_nums] = combined_df["feature"]

    # Loop through the times, renumbering frames and features so that
    # both are consistent across the combined dataframe.
    for frame_num, curr_time in enumerate(all_times):
        # Renumber the frame.
        combined_df.loc[combined_df["time"] == curr_time, "frame"] = frame_num
        # Renumber the features.
        curr_row_count = len(combined_df.loc[combined_df["time"] == curr_time])
        feat_num_arr = np.arange(start_feat_num, start_feat_num + curr_row_count)
        combined_df.loc[combined_df["time"] == curr_time, "feature"] = feat_num_arr
        start_feat_num = np.max(feat_num_arr) + 1

    combined_df = combined_df.reset_index(drop=True)
    return combined_df
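The renumbering steps can be illustrated outside tobac with a pure-pandas sketch; the toy dataframes below are hypothetical, carrying only the columns the function touches.

```python
import numpy as np
import pandas as pd

# Toy stand-ins for two per-process feature dataframes; each numbers
# its single feature from 1 independently (values are illustrative).
df_a = pd.DataFrame(
    {"frame": [0], "feature": [1], "time": [pd.Timestamp("2022-01-01 00:00")]}
)
df_b = pd.DataFrame(
    {"frame": [0], "feature": [1], "time": [pd.Timestamp("2022-01-01 00:05")]}
)

# The same combine-and-renumber steps, done directly with pandas:
# concatenate, sort, then walk the unique times assigning consecutive
# frame numbers and globally unique feature numbers.
combined = pd.concat([df_a, df_b]).sort_values(["time", "feature"])
next_feat = combined["feature"].min()
for frame_num, curr_time in enumerate(sorted(combined["time"].unique())):
    mask = combined["time"] == curr_time
    combined.loc[mask, "frame"] = frame_num
    n_rows = int(mask.sum())
    combined.loc[mask, "feature"] = np.arange(next_feat, next_feat + n_rows)
    next_feat += n_rows
combined = combined.reset_index(drop=True)

print(combined["feature"].tolist())  # [1, 2]
print(combined["frame"].tolist())  # [0, 1]
```

The overlapping per-chunk feature numbers (1 and 1) come out as a single consecutive numbering (1 and 2), with frames renumbered in time order, matching what the tests in this pull request check.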