From 161bcea4a608a370fb06bd6e6d81b77ce0d03638 Mon Sep 17 00:00:00 2001
From: Francesco Pannarale <francesco.pannarale@ligo.org>
Date: Tue, 19 Nov 2024 09:47:30 -0700
Subject: [PATCH] Handling vetoes and timeslides in pygrb mini followups
 (#4941)

* Use veto and segments files in pycbc_pygrb_minifollowups; enforce using the correct timeslide when following triggers/injections

* Added note about pycbc_pygrb_minifollowups in pycbc_pygrb_page_tables

* Updated construct_trials and generalized extract_basic_trig_properties to extract_trig_properties

* Edited var name to something more meaningful

* More accurate docstring

* Aesthetics and missing import

* Simplified pycbc_pygrb_minifollowups and fixed a sign!

* Removed empty line

* f-string and bare except
---
 bin/pygrb/pycbc_pygrb_minifollowups         | 101 +++++++++++++++-----
 bin/pygrb/pycbc_pygrb_page_tables           |   2 +-
 pycbc/results/pygrb_postprocessing_utils.py |  78 +++++++--------
 3 files changed, 115 insertions(+), 66 deletions(-)

diff --git a/bin/pygrb/pycbc_pygrb_minifollowups b/bin/pygrb/pycbc_pygrb_minifollowups
index 78c2b4c97e5..0e172f2cb79 100644
--- a/bin/pygrb/pycbc_pygrb_minifollowups
+++ b/bin/pygrb/pycbc_pygrb_minifollowups
@@ -56,11 +56,23 @@ def add_wiki_row(outfile, cols):
 
 
 def make_timeseries_plot(workflow, trig_file, snr_type, central_time,
-                         shift_time, out_dir, ifo=None, tags=None):
+                         out_dir, ifo=None, seg_files=None,
+                         veto_file=None, tags=None):
     """Adds a node for a timeseries of PyGRB results to the workflow"""
 
     tags = [] if tags is None else tags
 
+    # Ensure that zero-lag data is used in these follow-up plots and store
+    # the slide-id option originally provided
+    orig_slide_id = None
+    if workflow.cp.has_option('pygrb_plot_snr_timeseries','slide-id'):
+        orig_slide_id = workflow.cp.get('pygrb_plot_snr_timeseries','slide-id')
+    workflow.cp.add_options_to_section(
+        'pygrb_plot_snr_timeseries',
+        [('slide-id', '0')],
+        True
+    )
+
     # Initialize job node with its tags
     grb_name = workflow.cp.get('workflow', 'trigger-name')
     extra_tags = ['GRB'+grb_name]
@@ -71,28 +83,42 @@ def make_timeseries_plot(workflow, trig_file, snr_type, central_time,
                           ifos=workflow.ifos, out_dir=out_dir,
                           tags=tags+extra_tags).create_node()
     node.add_input_opt('--trig-file', trig_file)
+    # Include the onsource trial if this is a follow up on the onsource
+    if 'loudest_onsource_event' in tags:
+        node.add_opt('--onsource')
+    # Pass the segments files and veto file
+    if seg_files:
+        node.add_input_list_opt('--seg-files', seg_files)
+    if veto_file:
+        node.add_input_opt('--veto-file', veto_file)
     node.new_output_file_opt(workflow.analysis_time, '.png',
                              '--output-file', tags=extra_tags)
     # Quantity to be displayed on the y-axis of the plot
     node.add_opt('--y-variable', snr_type)
     if ifo is not None:
         node.add_opt('--ifo', ifo)
-    reset_central_time = shift_time - central_time
-    # Horizontal axis range the = prevents errors with negative times
-    x_lims = str(-5.+reset_central_time)+','+str(reset_central_time+5.)
-    node.add_opt('--x-lims='+x_lims)
+    node.add_opt('--x-lims=-5.,5.')
     # Plot title
     if ifo is not None:
         title_str = f"'{ifo} SNR at {central_time:.3f} (s)'"
     else:
-        title_str = "'%s SNR at %.3f (s)'" % (snr_type.capitalize(),
-                                              central_time)
+        title_str = f"'{snr_type.capitalize()} SNR at {central_time:.3f} (s)'"
     node.add_opt('--trigger-time', central_time)
     node.add_opt('--plot-title', title_str)
 
     # Add job node to workflow
     workflow += node
 
+    # Revert the config parser back to how it was given
+    if orig_slide_id:
+        workflow.cp.add_options_to_section(
+            'pygrb_plot_snr_timeseries',
+            [('slide-id', orig_slide_id)],
+            True
+        )
+    else:
+        workflow.cp.remove_option('pygrb_plot_snr_timeseries','slide-id')
+
     return node.output_files
 
 
@@ -107,6 +133,11 @@ parser.add_argument('--followups-file',
                     help="HDF file with the triggers/injections to follow up")
 parser.add_argument('--wiki-file',
                     help="Name of file to save wiki-formatted table in")
+parser.add_argument("-a", "--seg-files", nargs="+", action="store",
+                    default=[], help="The location of the buffer, " +
+                    "onsource and offsource txt segment files.")
+parser.add_argument("-V", "--veto-file", action="store",
+                    help="The location of the xml veto file.")
 wf.add_workflow_command_line_group(parser)
 wf.add_workflow_settings_cli(parser, include_subdax_opts=True)
 ppu.pygrb_add_bestnr_cut_opt(parser)
@@ -142,50 +173,68 @@ trig_file = resolve_url_to_file(os.path.abspath(args.trig_file))
 ifos = ppu.extract_ifos(os.path.abspath(args.trig_file))
 num_ifos = len(ifos)
 
+# File instance of the veto file
+veto_file = args.veto_file
+start_rundir = os.getcwd()
+if veto_file:
+    veto_file = os.path.join(start_rundir, args.veto_file)
+    veto_file = wf.resolve_url_to_file(veto_file)
+
+# Convert the segments files to a FileList
+seg_files = wf.FileList([
+    wf.resolve_url_to_file(os.path.join(start_rundir, f))
+    for f in args.seg_files
+])
+
 # (Loudest) off/on-source events are on time-slid data so the
 # try will succeed, as it finds the time shift columns.
 is_injection_followup = True
 try:
     time_shift = fp[ifos[0]+' time shift (s)'][0]
     is_injection_followup = False
-except:
+except KeyError:
     pass
 
-
 # Loop over triggers/injections to be followed up
 for num_event in range(num_events):
     files = FileList([])
-    logging.info('Processing event: %s', num_event)
+    logging.info('Processing event: %s', num_event+1)
     gps_time = fp['GPS time'][num_event]
     gps_time = gps_time.astype(float)
+    tags = args.tags + [str(num_event+1)]
     if wiki_file:
         row = []
         for key in fp.keys():
             row.append(fp[key][num_event])
         add_wiki_row(wiki_file, row)
-    # Handle off/on-source loudest triggers follow-up (which are on slid data)
-    if not is_injection_followup:
-        for ifo in ifos:
-            time_shift = fp[ifo+' time shift (s)'][num_event]
-            ifo_time = gps_time - time_shift
-            files += make_timeseries_plot(workflow, trig_file,
-                                          'single', gps_time, ifo_time,
-                                          args.output_dir, ifo=ifo,
-                                          tags=args.tags + [str(num_event)])
-            files += mini.make_qscan_plot(workflow, ifo, ifo_time,
-                                          args.output_dir,
-                                          tags=args.tags + [str(num_event)])
     # Handle injections (which are on unslid data)
-    else:
+    if is_injection_followup:
         for snr_type in ['reweighted', 'coherent']:
             files += make_timeseries_plot(workflow, trig_file,
-                                          snr_type, gps_time, gps_time,
+                                          snr_type, gps_time,
                                           args.output_dir, ifo=None,
-                                          tags=args.tags + [str(num_event)])
+                                          seg_files=seg_files,
+                                          veto_file=veto_file,
+                                          tags=tags)
         for ifo in ifos:
             files += mini.make_qscan_plot(workflow, ifo, gps_time,
                                           args.output_dir,
-                                          tags=args.tags + [str(num_event)])
+                                          tags=tags)
+    # Handle off/on-source loudest triggers follow-up (which may be on slid
+    # data in the case of the off-source)
+    else:
+        for i_ifo, ifo in enumerate(ifos):
+            time_shift = fp[ifo+' time shift (s)'][num_event]
+            ifo_time = gps_time + time_shift
+            files += make_timeseries_plot(workflow, trig_file,
+                                          'single', ifo_time,
+                                          args.output_dir, ifo=ifo,
+                                          seg_files=seg_files,
+                                          veto_file=veto_file,
+                                          tags=tags)
+            files += mini.make_qscan_plot(workflow, ifo, ifo_time,
+                                          args.output_dir,
+                                          tags=tags)
 
     layouts += list(layout.grouper(files, 2))
 
diff --git a/bin/pygrb/pycbc_pygrb_page_tables b/bin/pygrb/pycbc_pygrb_page_tables
index fd7ed4ed9bc..6d53fc14efa 100755
--- a/bin/pygrb/pycbc_pygrb_page_tables
+++ b/bin/pygrb/pycbc_pygrb_page_tables
@@ -395,7 +395,7 @@ if lofft_outfile:
         d.append(bestnr)
         td.append(d)
 
-    # th: table header
+    # th: table header [pycbc_pygrb_minifollowups looks for 'Slide Num']
     th = ['Trial', 'Slide Num', 'p-value', 'GPS time',
           'Rec. m1', 'Rec. m2', 'Rec. Mc', 'Rec. spin1z', 'Rec. spin2z',
           'Rec. RA', 'Rec. Dec', 'SNR', 'Chi^2', 'Null SNR']
diff --git a/pycbc/results/pygrb_postprocessing_utils.py b/pycbc/results/pygrb_postprocessing_utils.py
index b29bd1d1ddd..a6da15f26b4 100644
--- a/pycbc/results/pygrb_postprocessing_utils.py
+++ b/pycbc/results/pygrb_postprocessing_utils.py
@@ -23,7 +23,6 @@
 Module to generate PyGRB figures: scatter plots and timeseries.
 """
 
-import os
 import logging
 import argparse
 import copy
@@ -33,6 +32,7 @@
 from scipy import stats
 import ligo.segments as segments
 from pycbc.events.coherent import reweightedsnr_cut
+from pycbc.events import veto
 from pycbc import add_common_pycbc_options
 from pycbc.io.hdf import HFile
 
@@ -242,7 +242,7 @@ def _extract_vetoes(veto_file, ifos, offsource):
             vetoes[ifo] = segments.segmentlist([offsource]) - clean_segs[ifo]
         vetoes.coalesce()
         for ifo in ifos:
-            for v in vetoes[ifo]: 
+            for v in vetoes[ifo]:
                 v_span = v[1] - v[0]
                 logging.info("%ds of data vetoed at GPS time %d",
                              v_span, v[0])
@@ -269,7 +269,7 @@ def _slide_vetoes(vetoes, slide_dict_or_list, slide_id, ifos):
 
 
 # =============================================================================
-# Recursive function to reach all datasets in an HDF file handle 
+# Recursive function to reach all datasets in an HDF file handle
 # =============================================================================
 def _dataset_iterator(g, prefix=''):
     """Reach all datasets in an HDF file handle"""
@@ -368,9 +368,6 @@ def load_data(input_file, ifos, rw_snr_threshold=None, data_tag=None,
     return trigs_dict
 
 
-
-
-
 # =============================================================================
 # Detector utils:
 # * Function to calculate the antenna response F+^2 + Fx^2
@@ -460,38 +457,35 @@ def sort_trigs(trial_dict, trigs, slide_dict, seg_dict):
 # =============================================================================
 # Extract trigger properties and store them as dictionaries
 # =============================================================================
-def extract_basic_trig_properties(trial_dict, trigs, slide_dict, seg_dict,
-                                  opts):
-    """Extract and store as dictionaries time, SNR, and BestNR of
-    time-slid triggers"""
+def extract_trig_properties(trial_dict, trigs, slide_dict, seg_dict, keys):
+    """Extract and store as dictionaries specific keys of time-slid
+    triggers (trigs) compatibly with the trials dictionary (trial_dict)"""
 
     # Sort the triggers into each slide
     sorted_trigs = sort_trigs(trial_dict, trigs, slide_dict, seg_dict)
-    logger.info("Triggers sorted.")
+    n_surviving_trigs = sum([len(i) for i in sorted_trigs.values()])
+    msg = f"{n_surviving_trigs} triggers found within the trials dictionary "
+    msg += "and sorted."
+    logging.info(msg)
+
+    found_trigs = {}
+    for key in keys:
+        found_trigs[key] = {}
 
-    # Build the 3 dictionaries
-    trig_time = {}
-    trig_snr = {}
-    trig_bestnr = {}
     for slide_id in slide_dict:
         slide_trigs = sorted_trigs[slide_id]
         indices = numpy.nonzero(
             numpy.isin(trigs['network/event_id'], slide_trigs))[0]
-        if slide_trigs:
-            trig_time[slide_id] = trigs['network/end_time_gc'][
-                indices]
-            trig_snr[slide_id] = trigs['network/coherent_snr'][
-                indices]
-        else:
-            trig_time[slide_id] = numpy.asarray([])
-            trig_snr[slide_id] = numpy.asarray([])
-        trig_bestnr[slide_id] = reweightedsnr_cut(
-            trigs['network/reweighted_snr'][indices],
-            opts.newsnr_threshold)
+        for key in keys:
+            if slide_trigs:
+                found_trigs[key][slide_id] = get_coinc_snr(trigs)[indices] \
+                    if key == 'network/coincident_snr' else trigs[key][indices]
+            else:
+                found_trigs[key][slide_id] = numpy.asarray([])
 
-    logger.info("Time, SNR, and BestNR of triggers extracted.")
+    logging.info("Triggers information extracted.")
 
-    return trig_time, trig_snr, trig_bestnr
+    return found_trigs
 
 
 # =============================================================================
@@ -582,9 +576,11 @@ def load_segment_dict(hdf_file_path):
 # =============================================================================
 # Construct the trials from the timeslides, segments, and vetoes
 # =============================================================================
-def construct_trials(seg_files, seg_dict, ifos, slide_dict, vetoes):
-    """Constructs trials from triggers, timeslides, segments and vetoes"""
+def construct_trials(seg_files, seg_dict, ifos, slide_dict, veto_file,
+                     hide_onsource=True):
+    """Constructs trials from segments, timeslides, and vetoes"""
 
+    logging.info("Constructing trials.")
     trial_dict = {}
 
     # Get segments
@@ -593,19 +589,23 @@ def construct_trials(seg_files, seg_dict, ifos, slide_dict, vetoes):
     # Separate segments
     trial_time = abs(segs['on'])
 
+    # Determine the veto segments
+    vetoes = _extract_vetoes(veto_file, ifos, segs['off'])
+
+    # Slide vetoes over trials: this can only *reduce* the analysis time
     for slide_id in slide_dict:
-        # These can only *reduce* the analysis time
         curr_seg_list = seg_dict[slide_id]
 
-        # Construct the buffer segment list
+        # Fill in the buffer segment list if the onsource data must be hidden
         seg_buffer = segments.segmentlist()
-        for ifo in ifos:
-            slide_offset = slide_dict[slide_id][ifo]
-            seg_buffer.append(segments.segment(segs['buffer'][0] -
-                                               slide_offset,
-                                               segs['buffer'][1] -
-                                               slide_offset))
-        seg_buffer.coalesce()
+        if hide_onsource:
+            for ifo in ifos:
+                slide_offset = slide_dict[slide_id][ifo]
+                seg_buffer.append(segments.segment(segs['buffer'][0] -
+                                                   slide_offset,
+                                                   segs['buffer'][1] -
+                                                   slide_offset))
+            seg_buffer.coalesce()
 
         # Construct the ifo-indexed dictionary of slid veteoes
         slid_vetoes = _slide_vetoes(vetoes, slide_dict, slide_id, ifos)