Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handling vetoes and timeslides in pygrb mini followups #4941

Merged
merged 9 commits into from
Nov 19, 2024
85 changes: 71 additions & 14 deletions bin/pygrb/pycbc_pygrb_minifollowups
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,23 @@ def add_wiki_row(outfile, cols):


def make_timeseries_plot(workflow, trig_file, snr_type, central_time,
shift_time, out_dir, ifo=None, tags=None):
shift_time, out_dir, ifo=None, seg_files=None,
veto_file=None, slide_id=0, tags=None):
"""Adds a node for a timeseries of PyGRB results to the workflow"""

tags = [] if tags is None else tags

# If necessary, overwrite the slide-id configuration option to ensure only
# the correct timeslide is used in these follow-up plots
orig_slide_id = None
if workflow.cp.has_option('pygrb_plot_snr_timeseries','slide-id'):
orig_slide_id = workflow.cp.get('pygrb_plot_snr_timeseries','slide-id')
workflow.cp.add_options_to_section(
'pygrb_plot_snr_timeseries',
[('slide-id', str(slide_id))],
True
)

# Initialize job node with its tags
grb_name = workflow.cp.get('workflow', 'trigger-name')
extra_tags = ['GRB'+grb_name]
Expand All @@ -71,6 +83,14 @@ def make_timeseries_plot(workflow, trig_file, snr_type, central_time,
ifos=workflow.ifos, out_dir=out_dir,
tags=tags+extra_tags).create_node()
node.add_input_opt('--trig-file', trig_file)
# Include the onsource trial if this is a follow up on the onsource
if 'loudest_onsource_event' in tags:
node.add_opt('--onsource')
# Pass the segments files and veto file
if seg_files:
node.add_input_list_opt('--seg-files', seg_files)
if veto_file:
node.add_input_opt('--veto-file', veto_file)
node.new_output_file_opt(workflow.analysis_time, '.png',
'--output-file', tags=extra_tags)
# Quantity to be displayed on the y-axis of the plot
Expand All @@ -93,6 +113,14 @@ def make_timeseries_plot(workflow, trig_file, snr_type, central_time,
# Add job node to workflow
workflow += node

# Revert back the original slide-id option if necessary
if orig_slide_id:
workflow.cp.add_options_to_section(
'pygrb_plot_snr_timeseries',
[('slide-id', orig_slide_id)],
True
)

return node.output_files


Expand All @@ -107,6 +135,11 @@ parser.add_argument('--followups-file',
help="HDF file with the triggers/injections to follow up")
parser.add_argument('--wiki-file',
help="Name of file to save wiki-formatted table in")
parser.add_argument("-a", "--seg-files", nargs="+", action="store",
default=[], help="The location of the buffer, " +
"onsource and offsource txt segment files.")
parser.add_argument("-V", "--veto-file", action="store",
help="The location of the xml veto file.")
wf.add_workflow_command_line_group(parser)
wf.add_workflow_settings_cli(parser, include_subdax_opts=True)
ppu.pygrb_add_bestnr_cut_opt(parser)
Expand Down Expand Up @@ -142,6 +175,19 @@ trig_file = resolve_url_to_file(os.path.abspath(args.trig_file))
ifos = ppu.extract_ifos(os.path.abspath(args.trig_file))
num_ifos = len(ifos)

# File instance of the veto file
veto_file = args.veto_file
start_rundir = os.getcwd()
if veto_file:
veto_file = os.path.join(start_rundir, args.veto_file)
veto_file = wf.resolve_url_to_file(veto_file)

# Convert the segments files to a FileList
seg_files = wf.FileList([
wf.resolve_url_to_file(os.path.join(start_rundir, f))
for f in args.seg_files
])

# (Loudest) off/on-source events are on time-slid data so the
# try will succeed, as it finds the time shift columns.
is_injection_followup = True
Expand All @@ -151,7 +197,6 @@ try:
except:
pass


# Loop over triggers/injections to be followed up
for num_event in range(num_events):
files = FileList([])
Expand All @@ -163,29 +208,41 @@ for num_event in range(num_events):
for key in fp.keys():
row.append(fp[key][num_event])
add_wiki_row(wiki_file, row)
# Handle injections (which are on unslid data)
if is_injection_followup:
for snr_type in ['reweighted', 'coherent']:
files += make_timeseries_plot(workflow, trig_file,
snr_type, gps_time, gps_time,
args.output_dir, ifo=None,
seg_files=seg_files,
veto_file=veto_file,
slide_id=0,
tags=args.tags + [str(num_event)])
for ifo in ifos:
files += mini.make_qscan_plot(workflow, ifo, gps_time,
args.output_dir,
tags=args.tags + [str(num_event)])
pannarale marked this conversation as resolved.
Show resolved Hide resolved
# Handle off/on-source loudest triggers follow-up (which are on slid data)
if not is_injection_followup:
else:
# If the file has information about slides, this is the follow up of an
# offsource event, otherwise it is the loudest onsource event, and
# zero-lag data must be used
slide_id = 0
if 'Slide Num' in fp.keys():
slide_id = fp['Slide Num'][num_event]
pannarale marked this conversation as resolved.
Show resolved Hide resolved
for ifo in ifos:
time_shift = fp[ifo+' time shift (s)'][num_event]
ifo_time = gps_time - time_shift
files += make_timeseries_plot(workflow, trig_file,
'single', gps_time, ifo_time,
args.output_dir, ifo=ifo,
seg_files=seg_files,
veto_file=veto_file,
slide_id=slide_id,
tags=args.tags + [str(num_event)])
files += mini.make_qscan_plot(workflow, ifo, ifo_time,
args.output_dir,
tags=args.tags + [str(num_event)])
# Handle injections (which are on unslid data)
else:
for snr_type in ['reweighted', 'coherent']:
files += make_timeseries_plot(workflow, trig_file,
snr_type, gps_time, gps_time,
args.output_dir, ifo=None,
tags=args.tags + [str(num_event)])
for ifo in ifos:
files += mini.make_qscan_plot(workflow, ifo, gps_time,
args.output_dir,
tags=args.tags + [str(num_event)])

layouts += list(layout.grouper(files, 2))

Expand Down
2 changes: 1 addition & 1 deletion bin/pygrb/pycbc_pygrb_page_tables
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ if lofft_outfile:
d.append(bestnr)
td.append(d)

# th: table header
# th: table header [pycbc_pygrb_minifollowups looks for 'Slide Num']
th = ['Trial', 'Slide Num', 'p-value', 'GPS time',
'Rec. m1', 'Rec. m2', 'Rec. Mc', 'Rec. spin1z', 'Rec. spin2z',
'Rec. RA', 'Rec. Dec', 'SNR', 'Chi^2', 'Null SNR']
Expand Down
78 changes: 39 additions & 39 deletions pycbc/results/pygrb_postprocessing_utils.py
pannarale marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
Module to generate PyGRB figures: scatter plots and timeseries.
"""

import os
import logging
import argparse
import copy
Expand All @@ -33,6 +32,7 @@
from scipy import stats
import ligo.segments as segments
from pycbc.events.coherent import reweightedsnr_cut
from pycbc.events import veto
from pycbc import add_common_pycbc_options
from pycbc.io.hdf import HFile

Expand Down Expand Up @@ -242,7 +242,7 @@ def _extract_vetoes(veto_file, ifos, offsource):
vetoes[ifo] = segments.segmentlist([offsource]) - clean_segs[ifo]
vetoes.coalesce()
for ifo in ifos:
for v in vetoes[ifo]:
for v in vetoes[ifo]:
v_span = v[1] - v[0]
logging.info("%ds of data vetoed at GPS time %d",
v_span, v[0])
Expand All @@ -269,7 +269,7 @@ def _slide_vetoes(vetoes, slide_dict_or_list, slide_id, ifos):


# =============================================================================
# Recursive function to reach all datasets in an HDF file handle
# Recursive function to reach all datasets in an HDF file handle
# =============================================================================
def _dataset_iterator(g, prefix=''):
"""Reach all datasets in an HDF file handle"""
Expand Down Expand Up @@ -368,9 +368,6 @@ def load_data(input_file, ifos, rw_snr_threshold=None, data_tag=None,
return trigs_dict





# =============================================================================
# Detector utils:
# * Function to calculate the antenna response F+^2 + Fx^2
Expand Down Expand Up @@ -460,38 +457,35 @@ def sort_trigs(trial_dict, trigs, slide_dict, seg_dict):
# =============================================================================
# Extract trigger properties and store them as dictionaries
# =============================================================================
def extract_basic_trig_properties(trial_dict, trigs, slide_dict, seg_dict,
opts):
"""Extract and store as dictionaries time, SNR, and BestNR of
time-slid triggers"""
def extract_trig_properties(trial_dict, trigs, slide_dict, seg_dict, keys):
"""Extract and store as dictionaries specific keys of time-slid
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with the change of name of the function. But since this and its precursor do two different things, shouldn't we keep them both? (see also comment on the output)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See the answer above. The call to the old function will be removed throughout PyGRB.

triggers (trigs) compatibly with the trials dictionary (trial_dict)"""

# Sort the triggers into each slide
sorted_trigs = sort_trigs(trial_dict, trigs, slide_dict, seg_dict)
logger.info("Triggers sorted.")
n_surviving_trigs = sum([len(i) for i in sorted_trigs.values()])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would leave the log info here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I removed this because the logging info below is more informative, while is still telling the user the triggers were sorted.

msg = f"{n_surviving_trigs} triggers found within the trials dictionary "
msg += "and sorted."
logging.info(msg)

found_trigs = {}
for key in keys:
found_trigs[key] = {}

# Build the 3 dictionaries
trig_time = {}
trig_snr = {}
trig_bestnr = {}
for slide_id in slide_dict:
slide_trigs = sorted_trigs[slide_id]
indices = numpy.nonzero(
numpy.isin(trigs['network/event_id'], slide_trigs))[0]
if slide_trigs:
trig_time[slide_id] = trigs['network/end_time_gc'][
indices]
trig_snr[slide_id] = trigs['network/coherent_snr'][
indices]
else:
trig_time[slide_id] = numpy.asarray([])
trig_snr[slide_id] = numpy.asarray([])
trig_bestnr[slide_id] = reweightedsnr_cut(
trigs['network/reweighted_snr'][indices],
opts.newsnr_threshold)
Comment on lines -488 to -490
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aren't we using this anymore? If just one of the functions is kept, maybe one could return the found_trigsas well as the bestNR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The user will get the 'network/reweighted_snr' by calling extract_trig_properties with 'network/reweighted_snr' among the keys. So this function is not returning the found_trigs dictionary with the keys requested (and 'network/reweighted_snr' may be one of these).

for key in keys:
if slide_trigs:
found_trigs[key][slide_id] = get_coinc_snr(trigs)[indices] \
if key == 'network/coincident_snr' else trigs[key][indices]
else:
found_trigs[key][slide_id] = numpy.asarray([])

logger.info("Time, SNR, and BestNR of triggers extracted.")
logging.info("Triggers information extracted.")

return trig_time, trig_snr, trig_bestnr
return found_trigs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pycbc_pygrb_efficiency, pycbc_pygrb_page_tables, and pycbc_pygrb_plot_stats_distribution are using this function and need three outputs not to break.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct, in the development version these scripts will call extract_trig_properties to get these quantities. But, I am breaking down the many changes in multiple PRs so that the diffs may be parsed with reasonable effort: one big PR would be complicated to handle for a reviewer.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with breaking it down, but approving this PR before any of the others would find difficult to run workflows without issues.

Copy link
Contributor Author

@pannarale pannarale Nov 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was true for quite a while until PR #4909, when PyCBC allowed to generate a (working) PyGRB workflow, but without vetoes. As for previous "PR series" that took us to that point, the idea is that we are enabling a new big feature and, by breaking changes down, accepting that the intermediate states of PyCBC between PR #4909 and the completion of this PR review work will break PyGRB again. At that point we will have a PyCBC where PyGRB workflows are fully working and with vetoes. It will then be the right moment to have a new CI/CD test with a small PyGRB search and hence stop this modus operandi. At the moment PyGRB is already broken on master :-)



# =============================================================================
Expand Down Expand Up @@ -582,9 +576,11 @@ def load_segment_dict(hdf_file_path):
# =============================================================================
# Construct the trials from the timeslides, segments, and vetoes
# =============================================================================
def construct_trials(seg_files, seg_dict, ifos, slide_dict, vetoes):
"""Constructs trials from triggers, timeslides, segments and vetoes"""
def construct_trials(seg_files, seg_dict, ifos, slide_dict, veto_file,
hide_onsource=True):
"""Constructs trials from segments, timeslides, and vetoes"""

logging.info("Constructing trials.")
trial_dict = {}

# Get segments
Expand All @@ -593,19 +589,23 @@ def construct_trials(seg_files, seg_dict, ifos, slide_dict, vetoes):
# Separate segments
trial_time = abs(segs['on'])

# Determine the veto segments
vetoes = _extract_vetoes(veto_file, ifos, segs['off'])

# Slide vetoes over trials: this can only *reduce* the analysis time
for slide_id in slide_dict:
# These can only *reduce* the analysis time
curr_seg_list = seg_dict[slide_id]

# Construct the buffer segment list
# Fill in the buffer segment list if the onsource data must be hidden
seg_buffer = segments.segmentlist()
for ifo in ifos:
slide_offset = slide_dict[slide_id][ifo]
seg_buffer.append(segments.segment(segs['buffer'][0] -
slide_offset,
segs['buffer'][1] -
slide_offset))
seg_buffer.coalesce()
if hide_onsource:
for ifo in ifos:
slide_offset = slide_dict[slide_id][ifo]
seg_buffer.append(segments.segment(segs['buffer'][0] -
slide_offset,
segs['buffer'][1] -
slide_offset))
seg_buffer.coalesce()

# Construct the ifo-indexed dictionary of slid veteoes
slid_vetoes = _slide_vetoes(vetoes, slide_dict, slide_id, ifos)
Expand Down
Loading