Skip to content

Commit

Permalink
Ranking statistic for live singles (gwastro#4689)
Browse files Browse the repository at this point in the history
* Allow the live single trigger fits to use ranking statistic rather than sngl-ranking

* inbin is no longer all the events above threshold, plotting to indicate with case where no triggers are found

* deal better with cases where there are no triggers

* Use ranking statistic for single-detector events

* Fix some errors

* fix some statistics so they can produce single-detector events

* Some codeclimate suggestions

* get fit coeff files into CI, set a maximum IFAR for singles

* alter the CI example run

* Codeclimate suggestions

* Line too long

* minor tweaks

* Used shared code

* Fix broken fixing

* missed that this needs the module

* typo

* calculate plotmax earlier and use it to decide the histogram bins

* Update bin/live/pycbc_live_plot_single_significance_fits

Co-authored-by: Tito Dal Canton <tito.dalcanton@ijclab.in2p3.fr>

* TDC comments

* Update threshold naming and description

* update argument in example

* Please do not look at the previous commit and see how much of an idiot I am

* Update bin/live/pycbc_live_combine_single_significance_fits

Co-authored-by: Tito Dal Canton <tito.dalcanton@ijclab.in2p3.fr>

---------

Co-authored-by: Tito Dal Canton <tito.dalcanton@ijclab.in2p3.fr>
  • Loading branch information
GarethCabournDavies and titodalcanton authored Jun 11, 2024
1 parent 9154481 commit d486ec2
Show file tree
Hide file tree
Showing 9 changed files with 266 additions and 101 deletions.
19 changes: 13 additions & 6 deletions bin/live/pycbc_live_combine_single_significance_fits
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,10 @@ for f in args.trfits_files:
bl = fit_f['bins_lower'][:]
bu = fit_f['bins_upper'][:]
sngl_rank = fit_f.attrs['sngl_ranking']
fit_thresh = fit_f.attrs['fit_threshold']
fit_func = fit_f.attrs['fit_function']
fit_thresh = {ifo: fit_f[ifo].attrs['fit_threshold']
for ifo in args.ifos}
fit_func = {ifo: fit_f[ifo].attrs['fit_function']
for ifo in args.ifos}

# Now go back through the fit files and read the actual information. Skip the
# files that have fit parameters inconsistent with what we found earlier.
Expand All @@ -80,9 +82,13 @@ for f in args.trfits_files:
with h5py.File(f, 'r') as fits_f:
# Check that the file uses the same setup as file 0, to make sure
# all coefficients are comparable
new_fit_func = {ifo: fits_f[ifo].attrs['fit_function']
for ifo in args.ifos}
new_fit_thresh = {ifo: fits_f[ifo].attrs['fit_threshold']
for ifo in args.ifos}
same_conf = (fits_f.attrs['sngl_ranking'] == sngl_rank
and fits_f.attrs['fit_threshold'] == fit_thresh
and fits_f.attrs['fit_function'] == fit_func
and new_fit_thresh == fit_thresh
and new_fit_func == fit_func
and fits_f['bins_lower'].size == bl.size
and all(fits_f['bins_lower'][:] == bl)
and all(fits_f['bins_upper'][:] == bu))
Expand All @@ -99,7 +105,7 @@ for f in args.trfits_files:
gps_last = 0
gps_first = np.inf
for ifo in args.ifos:
if ifo not in fits_f:
if ifo not in fits_f or 'triggers' not in fits_f[ifo]:
continue
trig_times = fits_f[ifo]['triggers']['end_time'][:]
gps_last = max(gps_last, trig_times.max())
Expand Down Expand Up @@ -143,7 +149,6 @@ cons_counts_out = {ifo: np.inf * np.ones(len(alphas_bin[ifo])) for ifo in args.i
logging.info("Writing results")

fout = h5py.File(args.output, 'w')
fout.attrs['fit_threshold'] = fit_thresh
fout.attrs['conservative_percentile'] = args.conservative_percentile
fout.attrs['ifos'] = args.ifos
fout['bins_edges'] = list(bl) + [bu[-1]]
Expand All @@ -152,6 +157,8 @@ fout['fits_dates'] = ad + start_time_n
for ifo in args.ifos:
logging.info(ifo)
fout_ifo = fout.create_group(ifo)
fout_ifo.attrs['fit_threshold'] = fit_thresh[ifo]
fout_ifo.attrs['fit_function'] = fit_func[ifo]
l_times = np.array(live_times[ifo])
total_time = l_times.sum()
fout_ifo.attrs['live_time'] = total_time
Expand Down
2 changes: 1 addition & 1 deletion bin/live/pycbc_live_plot_combined_single_significance_fits
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ for ifo in ifos:
continue

l_times = separate_times[ifo]
with np.errstate(divide='ignore'):
with np.errstate(divide='ignore', invalid='ignore'):
rate = counts / l_times

ma = mean_alpha[ifo][i]
Expand Down
50 changes: 37 additions & 13 deletions bin/live/pycbc_live_plot_single_significance_fits
Original file line number Diff line number Diff line change
Expand Up @@ -71,17 +71,22 @@ with h5py.File(args.trigger_fits_file, 'r') as trfit_f:
ifos = [k for k in trfit_f.keys() if not k.startswith('bins')]

# Grab some info from the attributes
sngl_ranking = trfit_f.attrs['sngl_ranking']
fit_threshold = trfit_f.attrs['fit_threshold']
fit_function = trfit_f.attrs['fit_function']
if trfit_f.attrs['ranking_statistic'] in ['quadsum','single_ranking_only']:
x_label = trfit_f.attrs['sngl_ranking']
else:
x_label = "Ranking statistic"
fit_threshold = {ifo: trfit_f[ifo].attrs['fit_threshold'] for ifo in ifos}
fit_function = {ifo: trfit_f[ifo].attrs['fit_function'] for ifo in ifos}
analysis_date = trfit_f.attrs['analysis_date']

# Get the triggers for each detector
# (This is ones which passed the cuts in the fitting code)
stats = {ifo: {} for ifo in ifos}
durations = {ifo: {} for ifo in ifos}
for ifo in ifos:
stats[ifo] = trfit_f[ifo]['triggers'][sngl_ranking][:]
if 'triggers' not in trfit_f[ifo]:
continue
stats[ifo] = trfit_f[ifo]['triggers']['stat'][:]
durations[ifo] = trfit_f[ifo]['triggers']['template_duration'][:]
live_time = {ifo: trfit_f[ifo].attrs['live_time'] for ifo in ifos}
alphas = {ifo: trfit_f[ifo]['fit_coeff'][:] for ifo in ifos}
Expand Down Expand Up @@ -124,7 +129,10 @@ for ifo in all_ifos:
maxstat = stats[ifo].max()
max_rate = 0

plotbins = np.linspace(fit_threshold, 1.05 * maxstat, 400)
statrange = maxstat - max(stats[ifo].min(), fit_threshold[ifo])
plotmax = maxstat + statrange * 0.05

plotbins = np.linspace(fit_threshold[ifo], plotmax, 400)

logging.info("Putting events into bins")
event_bin = np.array([tbins[d] for d in durations[ifo]])
Expand All @@ -134,8 +142,26 @@ for ifo in all_ifos:
binlabel = f"{lower:.3g} - {upper:.3g}"

inbin = event_bin == bin_num
bin_prop = bin_num / len(tbins)
bin_colour = plt.get_cmap(args.colormap)(bin_prop)

# Skip if there are no triggers in this bin in this IFO
if not any(inbin):
if not any(inbin) or alphas[ifo][bin_num] == -1:
ax.plot(
[],
[],
linewidth=2,
color=bin_colour,
label=binlabel,
alpha=0.6
)
ax.plot(
[],
[],
"--",
color=bin_colour,
label="No triggers"
)
continue
binned_sngl_stats = stats[ifo][event_bin == bin_num]

Expand All @@ -147,30 +173,28 @@ for ifo in all_ifos:
max_rate = max(max_rate, cum_rate[0])

ecf = eval_cum_fit(
fit_function,
fit_function[ifo],
plotbins,
alphas[ifo][bin_num],
fit_threshold
fit_threshold[ifo]
)
cum_fit = counts[ifo][bin_num] / live_time[ifo] * ecf

bin_prop = bin_num / len(tbins)
bin_colour = plt.get_cmap(args.colormap)(bin_prop)
ax.plot(edges[:-1], cum_rate, linewidth=2,
color=bin_colour, label=binlabel, alpha=0.6)
ax.plot(plotbins, cum_fit, "--", color=bin_colour,
label=r"$\alpha = $%.2f" % alphas[ifo][bin_num])
ax.semilogy()
ax.grid()
ax.set_xlim(
fit_threshold if args.x_lim_lower is None else args.x_lim_lower,
1.05 * maxstat if args.x_lim_upper is None else args.x_lim_upper
fit_threshold[ifo] if args.x_lim_lower is None else args.x_lim_lower,
plotmax if args.x_lim_upper is None else args.x_lim_upper
)
ax.set_ylim(
0.5 / live_time[ifo] if args.y_lim_lower is None else args.y_lim_lower,
1.5 * max_rate if args.y_lim_upper is None else args.y_lim_upper
)
ax.set_xlabel(sngl_ranking)
ax.set_xlabel(x_label)
ax.set_ylabel("Number of louder triggers per live time")
title = f"{ifo} {analysis_date} trigger fits"
ax.set_title(title)
Expand Down
74 changes: 49 additions & 25 deletions bin/live/pycbc_live_single_significance_fits
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@ import h5py

import pycbc
from pycbc.bin_utils import IrregularBins
from pycbc.events import cuts, trigger_fits as trstats
from pycbc.events import cuts, trigger_fits as trstats, stat
from pycbc.io import DictArray
from pycbc.events import ranking
from pycbc.events.coinc import cluster_over_time
from pycbc.types import MultiDetOptionAction


def duration_bins_from_cli(args):
Expand Down Expand Up @@ -75,6 +76,7 @@ parser.add_argument("--file-identifier", default="H1L1V1-Live",
help="String required in filename to be considered for "
"analysis. Default: 'H1L1V1-Live'.")
parser.add_argument("--fit-function", default="exponential",
action=MultiDetOptionAction,
choices=["exponential", "rayleigh", "power"],
help="Functional form for the maximum likelihood fit. "
"Choose from exponential, rayleigh or power. "
Expand Down Expand Up @@ -112,18 +114,20 @@ parser.add_argument("--prune-stat-threshold", type=float,
help="Minimum --sngl-ranking value to consider a "
"trigger for pruning.")
parser.add_argument("--fit-threshold", type=float, default=5,
action=MultiDetOptionAction,
help="Lower threshold used in fitting the triggers."
"Default 5.")
"Default 5. Can be supplied as a single value, "
"or as a set of IFO:value pairs, e.g. H1:0:, L1:-1")
parser.add_argument("--cluster", action='store_true',
help="Only use maximum of the --sngl-ranking value "
"from each file.")
parser.add_argument("--output", required=True,
help="File in which to save the output trigger fit "
"parameters.")
parser.add_argument("--sngl-ranking", default="newsnr",
choices=ranking.sngls_ranking_function_dict.keys(),
help="The single-detector trigger ranking to use.")

stat.insert_statistic_option_group(
parser,
default_ranking_statistic='single_ranking_only'
)
cuts.insert_cuts_option_group(parser)

args = parser.parse_args()
Expand All @@ -133,8 +137,11 @@ pycbc.init_logging(args.verbose)
# Check input options

# Pruning options are mutually required or not needed
prune_options = [args.prune_loudest, args.prune_window,
args.prune_stat_threshold]
prune_options = [
args.prune_loudest is not None,
args.prune_window is not None,
args.prune_stat_threshold is not None
]

if any(prune_options) and not all(prune_options):
parser.error("Require all or none of --prune-loudest, "
Expand Down Expand Up @@ -162,6 +169,9 @@ if args.duration_bin_end and \
"--duration-bin-start, got "
f"{args.duration_bin_end} and {args.duration_bin_start}")


stat_kwargs = stat.parse_statistic_keywords_opt(args.statistic_keywords)

duration_bin_edges = duration_bins_from_cli(args)
logging.info("Duration bin edges: %s", duration_bin_edges)

Expand All @@ -177,13 +187,7 @@ args.template_cuts = args.template_cuts or []
args.template_cuts.append(f"template_duration:{min(duration_bin_edges)}:lower")
args.template_cuts.append(f"template_duration:{max(duration_bin_edges)}:upper_inc")

# Efficiency saving: add SNR cut before any others as sngl_ranking can
# only be less than SNR.
args.trigger_cuts = args.trigger_cuts or []
args.trigger_cuts.insert(0, f"snr:{args.fit_threshold}:lower_inc")

# Cut triggers with sngl-ranking below threshold
args.trigger_cuts.append(f"{args.sngl_ranking}:{args.fit_threshold}:lower_inc")

logging.info("Setting up the cut dictionaries")
trigger_cut_dict, template_cut_dict = cuts.ingest_cuts_option_group(args)
Expand Down Expand Up @@ -211,6 +215,9 @@ files = [f for f in os.listdir(date_directory)

events = {}

rank_method = {ifo: stat.get_statistic_from_opts(args, [ifo]) for ifo in args.ifos}

logging.info("Processing %d files", len(files))
for counter, filename in enumerate(files):
if counter and counter % 1000 == 0:
logging.info("Processed %d/%d files", counter, len(files))
Expand Down Expand Up @@ -284,10 +291,13 @@ for counter, filename in enumerate(files):
for k in trigs_ifo.keys()}

# Calculate the sngl_ranking values
sngls_value = ranking.get_sngls_ranking_from_trigs(
triggers_cut, args.sngl_ranking)
sds = rank_method[ifo].single(triggers_cut)
sngls_value = rank_method[ifo].rank_stat_single(
(ifo, sds),
**stat_kwargs
)

triggers_cut[args.sngl_ranking] = sngls_value
triggers_cut['stat'] = sngls_value

triggers_da = DictArray(data=triggers_cut)

Expand Down Expand Up @@ -330,7 +340,7 @@ for ifo in events:
for bin_num in range(n_bins):
inbin = event_bins[ifo] == bin_num

binned_events = events[ifo].data[args.sngl_ranking][inbin]
binned_events = events[ifo].data['stat'][inbin]
binned_event_times = events[ifo].data['end_time'][inbin]

# Cluster triggers in time with the pruning window to ensure
Expand Down Expand Up @@ -396,17 +406,32 @@ for ifo in events:
alphas[ifo][bin_num] = -1
continue

counts[ifo][bin_num] = np.count_nonzero(inbin)
stat_inbin = events[ifo].data['stat'][inbin]
counts[ifo][bin_num] = \
np.count_nonzero(stat_inbin > args.fit_threshold[ifo])

alphas[ifo][bin_num], _ = trstats.fit_above_thresh(
args.fit_function,
events[ifo].data[args.sngl_ranking][inbin],
args.fit_threshold
args.fit_function[ifo],
stat_inbin,
args.fit_threshold[ifo]
)

logging.info("Writing results")
with h5py.File(args.output, 'w') as fout:
for ifo in events:
for ifo in args.ifos:
fout_ifo = fout.create_group(ifo)
fout_ifo.attrs['fit_function'] = args.fit_function[ifo]
fout_ifo.attrs['fit_threshold'] = args.fit_threshold[ifo]
if ifo not in events:
# There were no triggers, but we should still produce some
# information
fout_ifo['fit_coeff'] = -1 * np.zeros(n_bins)
fout_ifo['counts'] = np.zeros(n_bins)
fout_ifo.attrs['live_time'] = live_time[ifo]
fout_ifo.attrs['pruned_times'] = []
fout_ifo.attrs['n_pruned'] = 0
continue

# Save the triggers we have used for the fits
fout_ifo_trigs = fout_ifo.create_group('triggers')
for key in events[ifo].data:
Expand All @@ -427,8 +452,7 @@ with h5py.File(args.output, 'w') as fout:
fout.attrs['analysis_date'] = args.analysis_date
fout.attrs['input'] = sys.argv
fout.attrs['cuts'] = args.template_cuts + args.trigger_cuts
fout.attrs['fit_function'] = args.fit_function
fout.attrs['fit_threshold'] = args.fit_threshold
fout.attrs['sngl_ranking'] = args.sngl_ranking
fout.attrs['ranking_statistic'] = args.ranking_statistic

logging.info("Done")
27 changes: 27 additions & 0 deletions examples/live/make_fit_coeffs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""
Makes files which can be used as the fit_coeffs statistic.
These are not of any scientific use, but the code will accept them
and run properly
"""

import h5py
import numpy as np

# Get number of templates from bank file
with h5py.File('template_bank.hdf', 'r') as bankf:
n_templates = bankf['mass1'].size

for ifo in ['H1','L1','V1']:
with h5py.File(f'{ifo}-fit_coeffs.hdf','w') as fits_f:
fits_f.attrs['analysis_time'] = 430000
fits_f.attrs['ifo'] = ifo
fits_f.attrs['stat'] = f'{ifo}-fit_coeffs'
fits_f.attrs['stat_threshold'] = 5

fits_f['count_above_thresh'] = np.ones(n_templates) * 100
fits_f['count_in_template'] = np.ones(n_templates) * 20000
fits_f['fit_coeff'] = np.ones(n_templates) * 5.5
fits_f['median_sigma'] = np.ones(n_templates) * 5800
fits_f['template_id'] = np.arange(n_templates)


Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import h5py
import numpy as np

f = h5py.File('single_trigger_fits.hdf','w')
f = h5py.File('single_significance_fits.hdf','w')

# Some numbers to design the output
# These are loosely based on the O3a trigger fits file
Expand Down
Loading

0 comments on commit d486ec2

Please sign in to comment.