Add dq fitting to Live supervised fitting script (gwastro#4866)
* Add 'dq_state' to list of 'region_ref_datasets' in trigger collation script
* Added method skeletons for dq trigger rate calculations to supervision script
* Finish first draft of supervision script
* First draft of daily dq trigger rates script
* Modify handling of template bins to match offline
* Split MDC original date calculation into a separate function
* Add pycbc_live_combine_dq_trigger_rates script
* Make pycbc_plot_dq_flag_likelihood work with low-latency dq
* Accept analysis flag name and frame type as arguments to daily dq job
* Make plotting work, and other small fixes
* Format variable dq file name
* Added option to use pregenerated trigger file
* for real this time
* Gareth comments
* Renamed daily -> collated
* daily -> collated inside supervision script
* Use deterministic hashing
* Try to fix h5py error
* Use h5py File instead of PyCBC HFile for outputs
* HFile -> h5py for outputs
* One more Gareth comment
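The commit message refers to a supervision script that launches the daily dq job. As a rough sketch of that hand-off (the executable name, file paths, channel names, frame type, and threshold below are illustrative assumptions, not taken from this diff; the option names match the argparse definitions in the first new script), a supervisor might build the command like this:

    # Hypothetical sketch: how a supervising process might invoke the
    # daily dq trigger-rate job. Executable name and all values are
    # illustrative assumptions; only the option names come from the
    # argparse setup in the script added below.
    import subprocess

    day_start = 1400000000  # assumed GPS start of the day being fit
    day_end = day_start + 86400

    cmd = [
        'pycbc_live_collated_dq_trigger_rates',  # assumed executable name
        '--trigger-file', 'H1-Live-triggers.hdf',  # assumed path
        '--ifo', 'H1',
        '--gps-start-time', str(day_start),
        '--gps-end-time', str(day_end),
        '--template-bin-file', 'template_bins.hdf',  # assumed path
        '--analysis-frame-type', '{ifo}_HOFT_C00',  # assumed frame type
        '--analysis-flag-name', '{ifo}:DMT-ANALYSIS_READY:1',  # assumed flag
        '--dq-channel', '{ifo}:IDQ-FAP',  # assumed channel template
        '--dq-ok-channel', '{ifo}:IDQ-OK',  # assumed channel template
        '--dq-thresh', '0.01',  # assumed threshold
        '--output', f'H1-dq_trigger_rates-{day_start}.hdf',
    ]
    subprocess.run(cmd, check=True)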
Showing 6 changed files with 598 additions and 115 deletions.
@@ -0,0 +1,157 @@
#!/usr/bin/python

# Copyright 2024 Max Trevor
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.

"""
Calculate the noise trigger rate adjustment as a function of data-quality state
for a day of PyCBC Live triggers.
"""

import logging
import argparse
import hashlib

import numpy as np
import h5py as h5

import pycbc
from pycbc.io import HFile

import gwdatafind

from gwpy.timeseries import TimeSeriesDict
from gwpy.segments import Segment, DataQualityFlag

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--trigger-file", required=True)
parser.add_argument("--ifo", required=True)
parser.add_argument("--gps-start-time", required=True, type=int)
parser.add_argument("--gps-end-time", required=True, type=int)
parser.add_argument("--template-bin-file", required=True)
parser.add_argument("--analysis-frame-type", required=True)
parser.add_argument("--analysis-flag-name", required=True)
parser.add_argument("--dq-channel", required=True)
parser.add_argument("--dq-ok-channel", required=True)
parser.add_argument("--dq-thresh", required=True, type=float)
parser.add_argument("--output", required=True)
pycbc.add_common_pycbc_options(parser)
args = parser.parse_args()

pycbc.init_logging(args.verbose)

# Get observing segs
ar_flag_name = args.analysis_flag_name.format(ifo=args.ifo)
day_seg = Segment(args.gps_start_time, args.gps_end_time)
observing_flag = DataQualityFlag.query(ar_flag_name, day_seg)
observing_segs = observing_flag.active
livetime = observing_flag.livetime
logging.info(f'Found {livetime} seconds of observing time at {args.ifo}.')

# for each segment, check how much time was dq flagged
flagged_time = 0
frame_type = args.analysis_frame_type.format(ifo=args.ifo)
dq_channel = args.dq_channel.format(ifo=args.ifo)
dq_ok_channel = args.dq_ok_channel.format(ifo=args.ifo)
for seg in observing_segs:
    frames = gwdatafind.find_urls(
        args.ifo[0],
        frame_type,
        seg[0],
        seg[1],
    )
    tsdict = TimeSeriesDict.read(
        frames,
        channels=[dq_channel, dq_ok_channel],
        start=seg[0],
        end=seg[1],
    )
    # count time as flagged when the dq-ok channel says the dq data is
    # usable and the dq channel is at or below the threshold
    dq_ok_flag = (tsdict[dq_ok_channel] == 1).to_dqflag()
    dq_flag = (tsdict[dq_channel] <= args.dq_thresh).to_dqflag()
    flagged_time += (dq_flag & dq_ok_flag).livetime
logging.info(f'Found {flagged_time} seconds of dq flagged time at {args.ifo}.')

# livetime in each dq state: index 0 = flag off (background), 1 = flag on
bg_livetime = livetime - flagged_time
state_time = np.array([bg_livetime, flagged_time])

# read in template bins
logging.info(f'Reading template bins from {args.template_bin_file}')
template_bins = {}
num_templates = 0
with HFile(args.template_bin_file, 'r') as bin_file:
    f_lower = bin_file.attrs['f_lower']
    bank_file = bin_file.attrs['bank_file']
    bin_string = bin_file.attrs['background_bins']

    # only ever one group in this file
    grp = bin_file[list(bin_file.keys())[0]]
    num_bins = len(grp.keys())
    for k in grp.keys():
        template_bins[k] = grp[k]['tids'][:]
        num_templates += len(template_bins[k])

# for each bin, get total number of triggers and number of dq flagged triggers
bin_total_triggers = np.zeros(num_bins)
bin_dq_triggers = np.zeros(num_bins)

logging.info(f'Reading triggers from {args.trigger_file}')
with HFile(args.trigger_file, 'r') as trigf:
    for bin_name in template_bins.keys():
        # bins are named as 'bin{bin_num}'
        bin_num = int(bin_name[3:])
        template_ids = template_bins[bin_name]
        for template_id in template_ids:
            # get dq states for all triggers with this template
            # dq state is either 0 or 1 for each trigger
            dq_key = trigf[f'{args.ifo}/dq_state_template'][template_id]
            dq_states = trigf[f'{args.ifo}/dq_state'][dq_key]

            # update trigger counts
            bin_total_triggers[bin_num] += len(dq_states)
            bin_dq_triggers[bin_num] += np.sum(dq_states)

# write outputs to file
logging.info(f'Writing results to {args.output}')
with h5.File(args.output, 'w') as f:
    ifo_group = f.create_group(args.ifo)
    ifo_group.create_dataset('observing_livetime', data=livetime)
    ifo_group.create_dataset('dq_flag_livetime', data=flagged_time)
    bin_group = ifo_group.create_group('bins')
    for bin_name in template_bins.keys():
        bin_num = int(bin_name[3:])
        bgrp = bin_group.create_group(bin_name)
        bgrp.create_dataset('tids', data=template_bins[bin_name])
        bgrp.create_dataset('total_triggers', data=bin_total_triggers[bin_num])
        bgrp.create_dataset('dq_triggers', data=bin_dq_triggers[bin_num])

        # normalized rate in each dq state: (triggers in state / livetime
        # in state), divided by the bin's mean rate over all observing time
        bg_triggers = bin_total_triggers[bin_num] - bin_dq_triggers[bin_num]
        num_trigs = np.array([bg_triggers, bin_dq_triggers[bin_num]])
        trig_rates = num_trigs / state_time
        mean_rate = bin_total_triggers[bin_num] / livetime
        normalized_rates = trig_rates / mean_rate
        bgrp.create_dataset('dq_rates', data=normalized_rates)

    f.attrs['dq_thresh'] = args.dq_thresh
    f.attrs['dq_channel'] = dq_channel
    f.attrs['dq_ok_channel'] = dq_ok_channel
    f.attrs['gps_start_time'] = args.gps_start_time
    f.attrs['gps_end_time'] = args.gps_end_time
    f.attrs['f_lower'] = f_lower
    f.attrs['bank_file'] = bank_file
    f.attrs['background_bins'] = bin_string

    # hash is used to check if different files have compatible settings
    settings_to_hash = [args.dq_thresh, dq_channel, dq_ok_channel,
                        f_lower, bank_file, bin_string]
    setting_str = ' '.join([str(s) for s in settings_to_hash])
    hash_object = hashlib.sha256(setting_str.encode())
    f.attrs['settings_hash'] = hash_object.hexdigest()
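To make the hash check concrete, here is a minimal reader sketch. The file name and detector are illustrative assumptions; the dataset layout and hash recipe match what the script above writes, and the recomputed hash should agree with the stored one provided the attributes round-trip through HDF5 with the same string forms:

    # Minimal sketch: read back a daily output file and verify its
    # settings hash. 'H1-dq_trigger_rates.hdf' and 'H1' are assumptions.
    import hashlib
    import h5py as h5

    ifo = 'H1'  # assumed detector
    with h5.File('H1-dq_trigger_rates.hdf', 'r') as f:  # assumed file name
        # recompute the hash from the stored settings, in the same order
        settings = [f.attrs['dq_thresh'], f.attrs['dq_channel'],
                    f.attrs['dq_ok_channel'], f.attrs['f_lower'],
                    f.attrs['bank_file'], f.attrs['background_bins']]
        setting_str = ' '.join(str(s) for s in settings)
        rehash = hashlib.sha256(setting_str.encode()).hexdigest()
        assert rehash == f.attrs['settings_hash']

        # normalized rates per bin: index 0 = flag off, index 1 = flag on
        for bin_name in f[f'{ifo}/bins']:
            rates = f[f'{ifo}/bins/{bin_name}/dq_rates'][:]
            print(bin_name, rates)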
@@ -0,0 +1,104 @@
#!/usr/bin/python

# Copyright 2024 Max Trevor
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 3 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
# Public License for more details.

"""Combine the data-quality adjusted trigger rates from multiple days."""

import logging
import argparse

import numpy as np
import h5py as h5

import pycbc
from pycbc.io import HFile

parser = argparse.ArgumentParser(description=__doc__)
pycbc.add_common_pycbc_options(parser)
parser.add_argument("--daily-dq-files", nargs="+", required=True,
                    help="Files containing daily dq trigger rates")
parser.add_argument("--ifo", required=True)
parser.add_argument("--output", required=True)
args = parser.parse_args()

pycbc.init_logging(args.verbose)

daily_files = args.daily_dq_files
daily_files.sort()

# need all files to use compatible settings
# get settings hash from the last file
# we will only use files that have the same hash
with HFile(daily_files[-1], 'r') as last_file:
    settings_hash = last_file.attrs['settings_hash']
    bin_str = last_file.attrs['background_bins']
    bank_file = last_file.attrs['bank_file']
    f_lower = last_file.attrs['f_lower']
    dq_thresh = last_file.attrs['dq_thresh']
    dq_channel = last_file.attrs['dq_channel']
    dq_ok_channel = last_file.attrs['dq_ok_channel']
    bin_group = last_file[f'{args.ifo}/bins']
    template_bins = {bin_name: bin_group[bin_name]['tids'][:]
                     for bin_name in bin_group.keys()}
    num_bins = len(bin_str.split())

total_livetime = 0
flagged_livetime = 0
total_triggers = np.zeros(num_bins)
flagged_triggers = np.zeros(num_bins)
for fpath in daily_files:
    with HFile(fpath, 'r') as f:
        if f.attrs['settings_hash'] != settings_hash:
            warning_str = f'File {fpath} has incompatible settings, skipping'
            logging.warning(warning_str)
            continue
        total_livetime += f[f'{args.ifo}/observing_livetime'][()]
        flagged_livetime += f[f'{args.ifo}/dq_flag_livetime'][()]
        bin_group = f[f'{args.ifo}/bins']
        for bin_name in bin_group.keys():
            # bins are named as 'bin{bin_num}'
            bin_num = int(bin_name[3:])
            bgrp = bin_group[bin_name]
            total_triggers[bin_num] += bgrp['total_triggers'][()]
            flagged_triggers[bin_num] += bgrp['dq_triggers'][()]

total_trigger_rate = total_triggers / total_livetime
flag_trigger_rate = flagged_triggers / flagged_livetime
bg_triggers = total_triggers - flagged_triggers
bg_livetime = total_livetime - flagged_livetime
bg_trigger_rate = bg_triggers / bg_livetime

# save results
with h5.File(args.output, 'w') as f:
    ifo_grp = f.create_group(args.ifo)
    all_bin_grp = ifo_grp.create_group('bins')

    for bin_name, bin_tids in template_bins.items():
        bin_grp = all_bin_grp.create_group(bin_name)
        bin_grp['tids'] = bin_tids
        bin_num = int(bin_name[3:])
        # normalize the per-state rates by the bin's mean rate, as in the
        # daily job; use an array so the in-place division broadcasts over
        # both states (a plain list would raise a TypeError)
        bin_trig_rates = np.array([bg_trigger_rate[bin_num],
                                   flag_trigger_rate[bin_num]])
        bin_trig_rates /= total_trigger_rate[bin_num]
        bin_grp['dq_rates'] = bin_trig_rates
        bin_grp['num_triggers'] = total_triggers[bin_num]

    f.attrs['settings_hash'] = settings_hash
    f.attrs['stat'] = f'{args.ifo}-dq_stat_info'
    f.attrs['total_livetime'] = total_livetime
    f.attrs['flagged_livetime'] = flagged_livetime
    f.attrs['dq_thresh'] = dq_thresh
    f.attrs['dq_channel'] = dq_channel
    f.attrs['dq_ok_channel'] = dq_ok_channel
    f.attrs['background_bins'] = bin_str
    f.attrs['bank_file'] = bank_file
    f.attrs['f_lower'] = f_lower
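As a final illustration, here is a minimal sketch of how the combined rates could be consumed. The file name, detector, bin name, and dq state are illustrative assumptions; the group layout matches what the combining script above writes, with 'dq_rates' indexed by dq state (0 = flag off, 1 = flag on):

    # Hypothetical consumer sketch: look up the normalized trigger-rate
    # adjustment for a trigger, given its template bin and dq state.
    import h5py as h5

    ifo = 'H1'         # assumed detector
    bin_name = 'bin0'  # assumed template bin
    dq_state = 1       # assumed: trigger occurred while the dq flag was on

    with h5.File('H1-combined_dq_rates.hdf', 'r') as f:  # assumed file name
        adjustment = f[f'{ifo}/bins/{bin_name}/dq_rates'][dq_state]
        print(f'{bin_name} rate in state {dq_state}: {adjustment:.3f}x mean')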