Some changes to HFile.select() and SingleDetTriggers (gwastro#4545)
* Some changes to HFile.select() and mask_to_n_loudest_clustered

* missed comma

* Some CC problems

* Update

* revert changes which meant kwargs couldn't be passed to mask_to_n_loudest_clustered_events

* CC / fixes

* Use list rather than numpy array for mask

* remove messing about with the mask dtyping

* initialise a transparent mask if it is not given

* Fix problem when the filter value may not be available to the HFile

* thinko in pycbc_bin_trigger_rates_dq

* Don't make a huge boolean mask of all True values and then run nonzero on it = BAD IDEA

* CC

* re-enable option to have no mask

* efficiency saving in pycbc_plot_hist

* no mask option in trig_dict

* typo

* allow None mask in mask_size

* thought I could apply another mask but I can't

* get filter_func back, use new interface for recent changes

* Some minor fixes

* cc

* allow option to do a logical and on masks

* various minor bugfixes

* max z problem

* fixes to pycbc_page_snglinfo

* use alternative dict for better clarity

* IWH comments

* add check for required datasets (mostly to remind people to add the required dataset list if they add a new ranking!)

* fix bad check, remove min_snr option

* remove option from CI

* Fix bug with applying premask and using filter_rank threshold at the same time

* stop checking for required datasets - this is difficult when so many different types use the same function

* CC prefers np.flatnonzero(arr) to arr.nonzero()[0] (see the sketch after this list)

* Fix a couple of rarely-used scripts, remove unused imports, whitespace
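
Two of the items above ("Don't make a huge boolean mask..." and "CC prefers np.flatnonzero...") concern the same index-handling clean-up. A minimal numpy sketch of both points (illustrative values, not code from the commit):

```python
import numpy as np

snr = np.array([5.0, 9.2, 6.1, 11.4])

# Preferred form: flatnonzero returns the flat indices directly.
idx = np.flatnonzero(snr > 6.0)      # array([1, 2, 3])
# Equivalent but clumsier: nonzero() returns a tuple, hence the [0].
idx_also = (snr > 6.0).nonzero()[0]  # same result

# The anti-pattern called out above: when no cut is applied, building an
# all-True mask and running nonzero on it materialises every index for
# nothing; passing no mask at all avoids the allocation entirely.
mask = np.ones(snr.size, dtype=bool)
wasteful = np.flatnonzero(mask)      # == np.arange(snr.size), pointlessly
```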
GarethCabournDavies authored and acorreia61201 committed Apr 4, 2024
1 parent 64d7f08 commit be1a69c
Showing 15 changed files with 468 additions and 291 deletions.
60 changes: 3 additions & 57 deletions bin/all_sky_search/pycbc_bin_trigger_rates_dq
@@ -15,7 +15,7 @@ from pycbc.events.veto import (select_segments_by_definer,
start_end_to_segments,
segments_to_start_end)
from pycbc.types.optparse import MultiDetOptionAction
from pycbc.io.hdf import HFile, SingleDetTriggers
from pycbc.io.hdf import SingleDetTriggers
from pycbc.version import git_verbose_msg as version

parser = argparse.ArgumentParser(description=__doc__)
@@ -46,79 +46,25 @@ logging.info('Start')

ifo, flag_name = args.flag_name.split(':')

# Setup a data mask to remove any triggers with SNR below threshold
# This works as a pre-filter as SNR is always greater than or equal
# to sngl_ranking, except in the psdvar case, where it could increase.

with HFile(args.trig_file, 'r') as trig_file:
n_triggers_orig = trig_file[f'{ifo}/snr'].size
logging.info("Trigger file has %d triggers", n_triggers_orig)
logging.info('Generating trigger mask')
if f'{ifo}/psd_var_val' in trig_file:
idx, _, _ = trig_file.select(
lambda snr, psdvar: snr / psdvar ** 0.5 >= args.stat_threshold,
f'{ifo}/snr',
f'{ifo}/psd_var_val',
return_indices=True
)
else:
# psd_var_val may not have been calculated
idx, _ = trig_file.select(
lambda snr: snr >= args.stat_threshold,
f'{ifo}/snr',
return_indices=True
)
data_mask = np.zeros(n_triggers_orig, dtype=bool)
data_mask[idx] = True

# also get the gated times while we have the file open
gate_times = []
if args.gating_windows:
logging.info('Getting gated times')
try:
gating_types = trig_file[f'{ifo}/gating'].keys()
for gt in gating_types:
gate_times += list(trig_file[f'{ifo}/gating/{gt}/time'][:])
gate_times = np.unique(gate_times)
except KeyError:
logging.warning('No gating found in trigger file')

logging.info("Getting %s triggers from file with pre-cut SNR > %.3f",
idx.size, args.stat_threshold)

trigs = SingleDetTriggers(
args.trig_file,
None,
None,
None,
None,
ifo,
premask=data_mask
filter_rank=args.sngl_ranking,
filter_threshold=args.stat_threshold,
)

# Extract the data we actually need from the data structure:
tmplt_ids = trigs.template_id
trig_times = trigs.end_time
stat = trigs.get_ranking(args.sngl_ranking)

n_triggers = tmplt_ids.size

logging.info("Applying %s > %.3f cut", args.sngl_ranking,
args.stat_threshold)
keep = stat >= args.stat_threshold
tmplt_ids = tmplt_ids[keep]
trig_times = trig_times[keep]
logging.info("Removed %d triggers, %d remain",
n_triggers - tmplt_ids.size, tmplt_ids.size)

# Get the template bins
bin_tids_dict = {}
with h5.File(args.template_bins_file, 'r') as f:
ifo_grp = f[ifo]
for bin_name in ifo_grp.keys():
bin_tids_dict[bin_name] = ifo_grp[bin_name]['tids'][:]


# get analysis segments
analysis_segs = select_segments_by_definer(
args.analysis_segment_file,
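
The hunk above is the template for several scripts in this commit: roughly fifty lines that opened the trigger file by hand, ran HFile.select() on SNR, and built a boolean premask are collapsed into two keyword arguments on the SingleDetTriggers constructor. A hedged before/after sketch, using only the keywords visible in this diff (the file path and the 'newsnr' ranking name are illustrative assumptions):

```python
from pycbc.io.hdf import SingleDetTriggers

trig_file = "H1-TRIGGER_MERGE.hdf"  # hypothetical single-detector trigger file
ifo = "H1"

# Old call: six positional arguments, with the caller building data_mask itself.
# trigs = SingleDetTriggers(trig_file, None, None, None, None, ifo,
#                           premask=data_mask)

# New call: the ranking cut is applied inside the class.
trigs = SingleDetTriggers(
    trig_file,
    ifo,
    filter_rank="newsnr",   # name of the sngl ranking to pre-filter on
    filter_threshold=6.0,   # keep triggers with ranking >= this value
)
stat = trigs.get_ranking("newsnr")
```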
35 changes: 3 additions & 32 deletions bin/all_sky_search/pycbc_fit_sngls_split_binned
@@ -22,7 +22,7 @@ from matplotlib import pyplot as plt
import numpy as np

from pycbc import events, bin_utils, results
from pycbc.io import HFile, SingleDetTriggers
from pycbc.io import SingleDetTriggers
from pycbc.events import triggers as trigs
from pycbc.events import trigger_fits as trstats
from pycbc.events import stat as pystat
@@ -202,48 +202,19 @@ boundaries = trigf[args.ifo + '/template_boundaries'][:]

trigf.close()

# Setup a data mask to remove any triggers with SNR below threshold
# This works as a pre-filter as SNR is always greater than or equal
# to sngl_ranking, except in the psdvar case, where it could increase.
with HFile(args.trigger_file, 'r') as trig_file:
n_triggers_orig = trig_file[f'{args.ifo}/snr'].size
logging.info("Trigger file has %d triggers", n_triggers_orig)
logging.info('Generating trigger mask')
if f'{args.ifo}/psd_var_val' in trig_file:
idx, _, _ = trig_file.select(
lambda snr, psdvar: snr / psdvar ** 0.5 >= args.plot_lower_stat_limit,
f'{args.ifo}/snr',
f'{args.ifo}/psd_var_val',
return_indices=True
)
else:
# psd_var_val may not have been calculated
idx, _ = trig_file.select(
lambda snr: snr >= args.plot_lower_stat_limit,
f'{args.ifo}/snr',
return_indices=True
)
data_mask = np.zeros(n_triggers_orig, dtype=bool)
data_mask[idx] = True


logging.info('Calculating single stat values from trigger file')
trigs = SingleDetTriggers(
args.trigger_file,
None,
None,
None,
None,
args.ifo,
premask=data_mask
filter_rank=args.sngl_ranking,
filter_threshold=args.plot_lower_stat_limit,
)
# This is the direct pointer to the HDF file, used later on
trigf = trigs.trigs_f

stat = trigs.get_ranking(args.sngl_ranking)
time = trigs.end_time


logging.info('Processing template boundaries')
max_boundary_id = np.argmax(boundaries)
sorted_boundary_list = np.sort(boundaries)
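
The comment deleted from both scripts above carries the reasoning behind the pre-filter: every supported sngl_ranking is bounded above by SNR, except the psd-variation ranking, which divides SNR by sqrt(psd_var_val) and can therefore exceed it when the variation statistic is below one. That is why the removed code branched on whether psd_var_val was present. A small numeric illustration (invented values):

```python
snr = 8.0
psd_var_val = 0.64                   # variation statistic below one
threshold = 9.0

adjusted = snr / psd_var_val ** 0.5  # 10.0, larger than the raw SNR
# A plain SNR cut would wrongly drop this trigger; the psdvar branch keeps it.
print(snr >= threshold)       # False
print(adjusted >= threshold)  # True
```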
42 changes: 28 additions & 14 deletions bin/minifollowups/pycbc_injection_minifollowup
@@ -244,15 +244,17 @@ def nearby_missedinj(endtime, snr):
trigger_idx = {}
trigger_snrs = {}
trigger_times = {}
# This finds the triggers near to _any_ missed injection
for trig in single_triggers:
ifo = trig.ifo
with HFile(trig.lfn, 'r') as trig_f:
trigger_idx[ifo], trigger_times[ifo], trigger_snrs[ifo] = \
trigger_idx[ifo], data_tuple = \
trig_f.select(
nearby_missedinj,
f'{ifo}/end_time',
f'{ifo}/snr',
return_indices=True)
)
trigger_times[ifo], trigger_snrs[ifo] = data_tuple

# figure out how many injections to follow up
num_events = int(workflow.cp.get_opt_tags(
@@ -305,7 +307,7 @@ for num_event in range(num_events):
tags=args.tags + [str(num_event)])[0],)]

for sngl in single_triggers:
# Find the triggers close to this injection at this IFO
# Find the triggers close to _this_ injection at this IFO
ifo = sngl.ifo
trig_tdiff = abs(inj_params[ifo + '_end_time'] - trigger_times[ifo])
nearby = trig_tdiff < args.nearby_triggers_window
@@ -315,7 +317,7 @@ for num_event in range(num_events):
continue
# Find the loudest SNR in this window
loudest = numpy.argmax(trigger_snrs[ifo][nearby])
# Convert to the indexin the trigger file
# Convert to the index in the trigger file
nearby_trigger_idx = trigger_idx[ifo][nearby][loudest]
# Make the info snippet
sngl_info = mini.make_sngl_ifo(workflow, sngl, tmpltbank_file,
@@ -381,19 +383,31 @@ for num_event in range(num_events):
use_exact_inj_params=True)

for curr_ifo in args.single_detector_triggers:
# Finding loudest template in this detector near to the injection:
# First, find triggers close to the missed injection
single_fname = args.single_detector_triggers[curr_ifo]
hd_sngl = SingleDetTriggers(single_fname, args.bank_file, None, None,
None, curr_ifo)
end_times = hd_sngl.end_time
# Use SNR here or NewSNR??
snr = hd_sngl.snr
lgc_mask = abs(end_times - inj_params['tc']) < args.inj_window

if len(snr[lgc_mask]) == 0:
idx, _ = HFile(single_fname).select(
lambda t: abs(t - inj_params['tc']) < args.inj_window,
f'{curr_ifo}/end_time',
return_data=False,
)

if len(idx) == 0:
# No events in the injection window for this detector
continue

snr_idx = numpy.arange(len(lgc_mask))[lgc_mask][snr[lgc_mask].argmax()]
hd_sngl.mask = [snr_idx]
hd_sngl = SingleDetTriggers(
single_fname,
curr_ifo,
bank_file=args.bank_file,
premask=idx
)
# Next, find the loudest within this set of triggers
# Use SNR here or NewSNR, or other??
loudest_idx = hd_sngl.snr.argmax()
hd_sngl.apply_mask([loudest_idx])

# What are the parameters of this trigger?
curr_params = copy.deepcopy(inj_params)
curr_params['mass1'] = hd_sngl.mass1[0]
curr_params['mass2'] = hd_sngl.mass2[0]
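
This file shows the reworked HFile.select() interface most completely. As far as these hunks show: the old return_indices=True flag is gone, the method now returns a pair of (indices, tuple of filtered datasets), return_data=False skips reading the filtered columns back, and a group keyword prefixes the dataset names. A sketch under those assumptions (file name and values are hypothetical):

```python
from pycbc.io.hdf import HFile

fname = "H1-TRIGGER_MERGE.hdf"  # hypothetical trigger file
ifo = "H1"

with HFile(fname, "r") as f:
    # Indices plus the filtered data, one array per requested dataset.
    idx, (end_time, snr) = f.select(
        lambda t, s: s > 8.0,
        f"{ifo}/end_time",
        f"{ifo}/snr",
    )

    # Indices only: the filtered columns are not read back.
    idx, _ = f.select(
        lambda t: abs(t - 1187008882.4) < 10.0,
        "end_time",
        group=ifo,        # prefix applied to each dataset name
        return_data=False,
    )
```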
50 changes: 34 additions & 16 deletions bin/minifollowups/pycbc_page_snglinfo
@@ -21,7 +21,7 @@ matplotlib.use('Agg')
import lal
import pycbc.version, pycbc.events, pycbc.results, pycbc.pnutils
from pycbc.results import followup
from pycbc.events import stat
from pycbc.events import stat as pystat
from pycbc.io import hdf


@@ -62,37 +62,53 @@ parser.add_argument('--include-gracedb-link', action='store_true',
parser.add_argument('--significance-file',
help="If given, will search for this trigger's id in the file to see if "
"stat and p_astro values exists for this trigger.")
stat.insert_statistic_option_group(parser,
pystat.insert_statistic_option_group(parser,
default_ranking_statistic='single_ranking_only')


args = parser.parse_args()

pycbc.init_logging(args.verbose)

if args.trigger_id is not None:
# Make a mask which is just the trigger of interest
mask = numpy.array([args.trigger_id])
else:
mask = None

# Get the single-ifo triggers
sngl_file = hdf.SingleDetTriggers(args.single_trigger_file, args.bank_file,
args.veto_file, args.veto_segment_name, None, args.instrument)
sngl_file = hdf.SingleDetTriggers(
args.single_trigger_file,
args.instrument,
bank_file=args.bank_file,
veto_file=args.veto_file,
segment_name=args.veto_segment_name,
premask=mask,
)

rank_method = pystat.get_statistic_from_opts(args, [args.instrument])

rank_method = stat.get_statistic_from_opts(args, [args.instrument])
if args.trigger_id is not None:
sngl_file.mask = [args.trigger_id]
trig_id = args.trigger_id
sngl_file.mask_to_n_loudest_clustered_events(rank_method, n_loudest=1)
stat = sngl_file.stat[0]
# Mask already applied
pass
elif args.n_loudest is not None:
# Cluster by a ranking statistic and retain only the loudest n clustered
# triggers
sngl_file.mask_to_n_loudest_clustered_events(rank_method,
n_loudest=args.n_loudest+1)
sngl_file.mask_to_n_loudest_clustered_events(
rank_method,
n_loudest=args.n_loudest+1
)
else:
raise ValueError("Must give --n-loudest or --trigger-id.")

sds = rank_method.single(sngl_file.trig_dict())
stat = rank_method.rank_stat_single((sngl_file.ifo, sds))
if args.n_loudest is not None:
# Restrict to only the nth loudest, instead of all triggers up to nth
# loudest
stat = sngl_file.stat
l = stat.argsort()
stat = stat[l[0]]
sngl_file.mask = sngl_file.mask[l[0]]
else:
raise ValueError("Must give --n-loudest or --trigger-id.")
sngl_file.apply_mask(l[0])

# make a table for the single detector information ############################
time = sngl_file.end_time
@@ -142,7 +158,9 @@ if args.ranking_statistic in ["quadsum", "single_ranking_only"]:
else:
# Name would be too long - just call it ranking statistic
stat_name = 'Ranking Statistic'
stat_name_long = ' with '.join([args.ranking_statistic, args.sngl_ranking])
stat_name_long = ' with '.join(
[args.ranking_statistic, args.sngl_ranking]
)

headers.append(stat_name)

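
The reworked flow here computes the single-detector statistic once over the retained triggers, then narrows the object with apply_mask() instead of reassigning .mask directly. Schematically (following the calls in the new code above; assumes sngl_file and rank_method are constructed as in the script):

```python
# Per-trigger inputs for the statistic, from the masked trigger set.
sds = rank_method.single(sngl_file.trig_dict())
# Ranking statistic values for those triggers.
stat = rank_method.rank_stat_single((sngl_file.ifo, sds))

order = stat.argsort()          # ascending, so order[0] is the quietest
sngl_file.apply_mask(order[0])  # keep only the requested n-th loudest event
```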
15 changes: 7 additions & 8 deletions bin/minifollowups/pycbc_plot_trigger_timeseries
@@ -61,9 +61,13 @@ for ifo in args.single_trigger_files.keys():

# Identify trigger idxs within window of trigger time
with HFile(args.single_trigger_files[ifo], 'r') as data:
idx, _ = data.select(lambda endtime: abs(endtime - t) < args.window,
f'{ifo}/end_time', return_indices=True)
data_mask = numpy.zeros(len(data[f'{ifo}/snr']), dtype=bool)
idx, _ = data.select(
lambda endtime: abs(endtime - t) < args.window,
'end_time',
group=ifo,
return_data=False,
)
data_mask = numpy.zeros(data[ifo]['snr'].size, dtype=bool)
data_mask[idx] = True

if not len(idx):
@@ -75,13 +79,8 @@ for ifo in args.single_trigger_files.keys():
label=ifo)
continue

# Not using bank file, or veto file, so lots of "None"s here.
trigs = SingleDetTriggers(
args.single_trigger_files[ifo],
None,
None,
None,
None,
ifo,
premask=data_mask
)
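
One detail worth noting across these files: pycbc_injection_minifollowup above passes the index array from select() straight in as premask, while this script expands it into a full-length boolean array. As far as the diff shows, the constructor accepts either form; the two carry the same information, as sketched below (illustrative values):

```python
import numpy as np

n_triggers = 1_000_000          # total triggers in the file (illustrative)
idx = np.array([7, 41, 95000])  # indices as returned by HFile.select()

# Form 1: indices passed directly (pycbc_injection_minifollowup).
premask_indices = idx

# Form 2: full-length boolean mask (this script).
premask_bool = np.zeros(n_triggers, dtype=bool)
premask_bool[idx] = True

assert np.array_equal(np.flatnonzero(premask_bool), premask_indices)
```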
32 changes: 8 additions & 24 deletions bin/minifollowups/pycbc_sngl_minifollowup
@@ -65,8 +65,9 @@ parser.add_argument('--inspiral-data-read-name',
parser.add_argument('--inspiral-data-analyzed-name',
help="Name of inspiral segmentlist containing data "
"analyzed by each analysis job.")
parser.add_argument('--min-snr', type=float, default=6.5,
help="Minimum SNR to consider for loudest triggers")
parser.add_argument('--min-sngl-ranking', type=float, default=6.5,
help="Minimum sngl-ranking to consider for loudest "
"triggers. Default=6.5.")
parser.add_argument('--non-coinc-time-only', action='store_true',
help="If given remove (veto) single-detector triggers "
"that occur during a time when at least one other "
@@ -130,31 +131,14 @@ insp_data_seglists.coalesce()
num_events = int(workflow.cp.get_opt_tags('workflow-sngl_minifollowups',
'num-sngl-events', ''))

# This helps speed up the processing to ignore a large fraction of triggers
mask = None
f = hdf.HFile(args.single_detector_file, 'r')
n_triggers = f['{}/snr'.format(args.instrument)].size
logging.info("%i triggers in file", n_triggers)
if args.min_snr:
logging.info('Calculating Prefilter')
idx, _ = f.select(lambda snr: snr > args.min_snr,
'{}/snr'.format(args.instrument),
return_indices=True)
mask = numpy.zeros(n_triggers, dtype=bool)
mask[idx] = True
if len(idx) < num_events:
logging.info("Fewer triggers exist after the --min-snr cut (%d) "
"than requested for the minifollowup (%d)",
len(idx), num_events)

trigs = hdf.SingleDetTriggers(
args.single_detector_file,
args.bank_file,
args.foreground_censor_file,
args.foreground_segment_name,
None,
args.instrument,
premask=mask
bank_file=args.bank_file,
veto_file=args.foreground_censor_file,
segment_name=args.foreground_segment_name,
filter_rank=args.sngl_ranking,
filter_threshold=args.min_sngl_ranking
)

# Include gating vetoes
(The remaining nine changed files are not rendered on this page.)
