Commit
Updated selection (added tagID cut), reweighting and plot scripts
Caio Cesar Daumann committed Dec 21, 2023
1 parent d17203b commit 9336d35
Showing 4 changed files with 106 additions and 63 deletions.
101 changes: 64 additions & 37 deletions data_reading/read_data.py
@@ -16,7 +16,7 @@ def perform_zee_selection( data_df, mc_df ):
# first we need to calculate the invariant mass of the electron pair

# variable names used to calculate the mass
mass_vars = ["tag_pt","tag_ScEta","tag_phi","probe_pt","probe_ScEta","probe_phi","fixedGridRhoAll"]
mass_vars = ["tag_pt","tag_ScEta","tag_phi","probe_pt","probe_ScEta","probe_phi","fixedGridRhoAll", "tag_mvaID"]

mass_inputs_data = np.array( data_df[mass_vars])
mass_inputs_mc = np.array( mc_df[mass_vars] )
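The invariant-mass helper itself is elided from this diff. For reference, a minimal sketch of the standard dielectron invariant mass in the massless-electron approximation; the function name and the direct use of the supercluster eta are illustrative assumptions, not the repository's actual helper:

```python
import numpy as np

# illustrative sketch, not the repository's actual mass helper
def dielectron_mass(tag_pt, tag_eta, tag_phi, probe_pt, probe_eta, probe_phi):
    # massless approximation: m^2 = 2 * pT1 * pT2 * (cosh(d_eta) - cos(d_phi))
    return np.sqrt(2.0 * tag_pt * probe_pt
                   * (np.cosh(tag_eta - probe_eta) - np.cos(tag_phi - probe_phi)))

# e.g. mass_data = dielectron_mass(*[mass_inputs_data[:, i] for i in range(6)])
```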
@@ -27,12 +27,13 @@ def perform_zee_selection( data_df, mc_df ):

# now, in order to perform the needed cuts, two masks will be created
mask_data = np.logical_and( mass_data > 80 , mass_data < 100 )
mask_data = np.logical_and( mask_data , mass_inputs_data[:,0] > 40 )
mask_data = np.logical_and( mask_data , mass_inputs_data[:,3] > 20 )
mask_data = np.logical_and( mask_data , np.abs(mass_inputs_data[:,4]) < 2.5 )
mask_data = np.logical_and( mask_data , np.abs(mass_inputs_data[:,5]) < 3.1415 )
mask_data = np.logical_and( mask_data , mass_inputs_data[:,0] > 40 ) #tag pt cut
mask_data = np.logical_and( mask_data , mass_inputs_data[:,3] > 20 ) #probe pt cut
mask_data = np.logical_and( mask_data , np.abs(mass_inputs_data[:,4]) < 2.5 ) # eta cut
mask_data = np.logical_and( mask_data , np.abs(mass_inputs_data[:,5]) < 3.1415 ) # phi cut
mask_data = np.logical_and( mask_data , mass_inputs_data[:,6] < 100 )
mask_data = np.logical_and( mask_data , mass_inputs_data[:,6] > 0 )
mask_data = np.logical_and( mask_data , mass_inputs_data[:,7] > 0.0 ) # tag mvaID cut


mask_mc = np.logical_and( mass_mc > 80 , mass_mc < 100 )
@@ -42,6 +43,7 @@ def perform_zee_selection( data_df, mc_df ):
mask_mc = np.logical_and( mask_mc , np.abs(mass_inputs_mc[:,5]) < 3.1415 )
mask_mc = np.logical_and( mask_mc , mass_inputs_mc[:,6] > 0 )
mask_mc = np.logical_and( mask_mc , mass_inputs_mc[:,6] < 100 )
mask_mc = np.logical_and( mask_mc , mass_inputs_mc[:,7] > 0.0 )

# return the masks for further operations
return mask_data, mask_mc
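A minimal usage sketch of the returned masks (illustrative only):

```python
# illustrative: apply the selection masks returned above
mask_data, mask_mc = perform_zee_selection(data_df, mc_df)
data_selected = data_df[mask_data]  # Z->ee candidates in data
mc_selected = mc_df[mask_mc]        # Z->ee candidates in simulation
```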
@@ -65,7 +67,7 @@ def perform_zee_kinematics_reweighting(data_array, data_weights, mc_array, mc_we
phi_min, phi_max = -3.15,3.15

# now the number of bins in each distribution
pt_bins,rho_bins,eta_bins,phi_bins = 20, 30, 10, 4
pt_bins,rho_bins,eta_bins,phi_bins = 10, 30, 10, 2 #was 20, 30, 10, 4

# Now we create a 4d histogram of these kinematic variables
mc_histo, edges = np.histogramdd( sample = (mc_array[:,0] , mc_array[:,3], mc_array[:,1], mc_array[:,2]) , bins = (pt_bins,rho_bins,eta_bins,phi_bins), range = [ [pt_min,pt_max], [ rho_min, rho_max ],[eta_min,eta_max], [phi_min,phi_max] ], weights = mc_weights )
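The remainder of the reweighting function is collapsed by the diff viewer. As background, a hedged sketch of how per-event weights are typically derived from such a pair of 4D histograms; the data histogram, the empty-bin guard, and the bin lookup below are assumptions about the elided code, not the repository's exact implementation:

```python
# hedged sketch of the elided reweighting logic
data_histo, _ = np.histogramdd(
    sample=(data_array[:, 0], data_array[:, 3], data_array[:, 1], data_array[:, 2]),
    bins=(pt_bins, rho_bins, eta_bins, phi_bins),
    range=[[pt_min, pt_max], [rho_min, rho_max], [eta_min, eta_max], [phi_min, phi_max]],
    weights=data_weights,
)

# bin-by-bin data/MC ratio; empty MC bins fall back to a weight of 1
ratio = np.divide(data_histo, mc_histo,
                  out=np.ones_like(data_histo), where=mc_histo > 0)

# locate each MC event's bin along (pt, rho, eta, phi) and scale its weight
cols_and_nbins = [(0, pt_bins), (3, rho_bins), (1, eta_bins), (2, phi_bins)]
idx = [np.clip(np.digitize(mc_array[:, col], edges[dim]) - 1, 0, nbins - 1)
       for dim, (col, nbins) in enumerate(cols_and_nbins)]
mc_weights = np.asarray(mc_weights) * ratio[idx[0], idx[1], idx[2], idx[3]]
```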
@@ -129,33 +131,36 @@ def separate_training_data( data_df, mc_df, mc_weights, data_weights, input_vars
assert len( mc_weights ) == len( data_weights )
assert len( mc_conditions ) == len(data_conditions)

training_percent = 0.7
validation_percent = 0.05 + training_percent # cumulative fraction: training plus validation
testing_percent = 1 - validation_percent # remaining 25% is the test set (see the worked example after this hunk)

# Now, the fun part! - Separating everything into training, validation and testing datasets!
data_training_inputs = data_inputs[: int( 0.6*len(data_inputs )) ]
data_training_conditions = data_conditions[: int( 0.6*len(data_inputs )) ]
data_training_weights = data_weights[: int( 0.6*len(data_inputs )) ]
data_training_inputs = data_inputs[: int( training_percent*len(data_inputs )) ]
data_training_conditions = data_conditions[: int( training_percent*len(data_inputs )) ]
data_training_weights = data_weights[: int( training_percent*len(data_inputs )) ]

mc_training_inputs = mc_inputs[: int( 0.6*len(mc_inputs )) ]
mc_training_conditions = mc_conditions[: int( 0.6*len(mc_inputs )) ]
mc_training_weights = mc_weights[: int( 0.6*len(mc_inputs )) ]
mc_training_inputs = mc_inputs[: int( training_percent*len(mc_inputs )) ]
mc_training_conditions = mc_conditions[: int( training_percent*len(mc_inputs )) ]
mc_training_weights = mc_weights[: int( training_percent*len(mc_inputs )) ]

# now, the validation dataset
data_validation_inputs = data_inputs[int( 0.6*len(data_inputs )):int( 0.65*len(data_inputs )) ]
data_validation_conditions = data_conditions[int( 0.6*len(data_inputs )):int( 0.65*len(data_inputs )) ]
data_validation_weights = data_weights[int( 0.6*len(data_inputs )):int( 0.65*len(data_inputs )) ]
data_validation_inputs = data_inputs[int(training_percent*len(data_inputs )):int( validation_percent*len(data_inputs )) ]
data_validation_conditions = data_conditions[int(training_percent*len(data_inputs )):int( validation_percent*len(data_inputs )) ]
data_validation_weights = data_weights[int(training_percent*len(data_inputs )):int( validation_percent*len(data_inputs )) ]

mc_validation_inputs = mc_inputs[int( 0.6*len(mc_inputs )):int( 0.65*len(mc_inputs )) ]
mc_validation_conditions = mc_conditions[int( 0.6*len(mc_inputs )):int( 0.65*len(mc_inputs )) ]
mc_validation_weights = mc_weights[int( 0.6*len(mc_inputs )):int( 0.65*len(mc_inputs )) ]
mc_validation_inputs = mc_inputs[int(training_percent*len(mc_inputs )):int( validation_percent*len(mc_inputs )) ]
mc_validation_conditions = mc_conditions[int(training_percent*len(mc_inputs )):int( validation_percent*len(mc_inputs )) ]
mc_validation_weights = mc_weights[int(training_percent*len(mc_inputs )):int( validation_percent*len(mc_inputs )) ]

# now for the grand finale, the test tensors
data_test_inputs = data_inputs[int( 0.65*len(data_inputs )): ]
data_test_conditions = data_conditions[int( 0.65*len(data_inputs )): ]
data_test_weights = data_weights[int( 0.65*len(data_inputs )): ]
data_test_inputs = data_inputs[int( validation_percent*len(data_inputs )): ]
data_test_conditions = data_conditions[int( validation_percent*len(data_inputs )): ]
data_test_weights = data_weights[int( validation_percent*len(data_inputs )): ]

mc_test_inputs = mc_inputs[int( 0.65*len(mc_inputs )): ]
mc_test_conditions = mc_conditions[int( 0.65*len(mc_inputs )):]
mc_test_weights = mc_weights[int( 0.65*len(mc_inputs )):]
mc_test_inputs = mc_inputs[int( validation_percent*len(mc_inputs )): ]
mc_test_conditions = mc_conditions[int( validation_percent*len(mc_inputs )):]
mc_test_weights = mc_weights[int( validation_percent*len(mc_inputs )):]

# now, all the tensors are saved so they can be read by the training class
path_to_save_tensors = "/net/scratch_cms3a/daumann/PhD/EarlyHgg/simulation_to_data_corrections/data_reading/saved_tensors/zee_tensors/"
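For concreteness, the cumulative fractions above work out as follows on a hypothetical sample (illustrative arithmetic only):

```python
# illustrative only: cumulative-fraction slicing on a hypothetical 100k events
n = 100_000
train = slice(0, int(0.70 * n))            # [0, 70000)      -> 70%
val = slice(int(0.70 * n), int(0.75 * n))  # [70000, 75000)  -> 5%
test = slice(int(0.75 * n), n)             # [75000, 100000) -> 25%
```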
@@ -191,8 +196,6 @@ def separate_training_data( data_df, mc_df, mc_weights, data_weights, input_vars

def read_zee_data():

path_to_data = "/net/scratch_cms3a/daumann/nanoAODv12_Production/"

# We want to correct the variables that are used as input to the Run 3 photon MVA ID
var_list = ["probe_energyRaw",
"probe_r9",
@@ -215,18 +218,41 @@ def read_zee_data():
# These variables will be used as conditions to the normalizing flow - they will not be transformed!
conditions_list = [ "probe_pt","probe_ScEta","probe_phi","fixedGridRhoAll"]

# Let's now read the data and simulation as pandas dataframes
files_DY_mc = glob.glob( path_to_data + "DY_postEE_v12/nominal/*.parquet")
files_DY_mc = files_DY_mc[:200]
simulation = [pd.read_parquet(f) for f in files_DY_mc]
drell_yan_df = pd.concat(simulation,ignore_index=True)
# If True, read nanoAODv13 (high precision); else v12 (low precision)
read_nano_v13 = True
if( read_nano_v13 ):

path_to_data = "/net/scratch_cms3a/daumann/HiggsDNA/v13_samples_2/"

# Let's now read the data and simulation as pandas dataframes
files_DY_mc = glob.glob( path_to_data + "DY_postEE_v13/nominal/*.parquet")
files_DY_mc = files_DY_mc[:300]
simulation = [pd.read_parquet(f) for f in files_DY_mc]
drell_yan_df = pd.concat(simulation,ignore_index=True)

# now the data files for the eras F and G
files_DY_data_F = glob.glob( path_to_data + "dataG_v13/nominal/*.parquet") # note: era F here also reads the dataG_v13 directory
files_DY_data_F = files_DY_data_F[:300]

files_DY_data_G = glob.glob( path_to_data + "dataG_v13/nominal/*.parquet")
files_DY_data_G = files_DY_data_G[:300]

else:

path_to_data = "/net/scratch_cms3a/daumann/nanoAODv12_Production/"

# Let's now read the data and simulation as pandas dataframes
files_DY_mc = glob.glob( path_to_data + "DY_postEE_v12/nominal/*.parquet")
files_DY_mc = files_DY_mc[:20]
simulation = [pd.read_parquet(f) for f in files_DY_mc]
drell_yan_df = pd.concat(simulation,ignore_index=True)

# now the data files for the eras F and G
files_DY_data_F = glob.glob( "/net/scratch_cms3a/daumann/nanoAODv12_Production/Data_Run2022F_v12/nominal/*.parquet")
files_DY_data_F = files_DY_data_F[:300]
# now the data files for the eras F and G
files_DY_data_F = glob.glob( "/net/scratch_cms3a/daumann/nanoAODv12_Production/Data_Run2022F_v12/nominal/*.parquet")
files_DY_data_F = files_DY_data_F[:30]

files_DY_data_G = glob.glob( "/net/scratch_cms3a/daumann/nanoAODv12_Production/Data_Run2022G_v12/nominal/*.parquet")
files_DY_data_G = files_DY_data_G[:300]
files_DY_data_G = glob.glob( "/net/scratch_cms3a/daumann/nanoAODv12_Production/Data_Run2022G_v12/nominal/*.parquet")
files_DY_data_G = files_DY_data_G[:30]

# merging the file lists of both eras
files_DY_data = [files_DY_data_G,files_DY_data_F]
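The actual reading of the data files is collapsed by the diff viewer; a plausible sketch, assuming the nested per-era lists are simply flattened and concatenated (the use of `itertools.chain` is an assumption):

```python
# hedged sketch of the elided data loading
import itertools
data_df = pd.concat(
    [pd.read_parquet(f) for f in itertools.chain.from_iterable(files_DY_data)],
    ignore_index=True,
)
```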
@@ -245,14 +271,15 @@ def read_zee_data():

# now, due to differences in kinematics, a reweighting in the four kinematic variables [pt, eta, phi and rho] will be performed
mc_weights = drell_yan_df["weight"]
mc_weights_before = mc_weights # keep the pre-reweighting weights for the validation plots
data_weights = np.ones( len( data_df["fixedGridRhoAll"] ) )
data_weights, mc_weights = perform_zee_kinematics_reweighting(data_df[conditions_list][mask_data], data_weights[mask_data], drell_yan_df[conditions_list][mask_mc], mc_weights[mask_mc])

# now let's call a plotting function to plot the distributions we just read, for validation purposes
path_to_plots = "/net/scratch_cms3a/daumann/PhD/EarlyHgg/simulation_to_data_corrections/plot/validation_plots/"

# now as a last step, we need to split the data into training, validation and test datasets
plot_utils.plot_distributions( path_to_plots, data_df[mask_data], drell_yan_df[mask_mc], mc_weights, [var_list, conditions_list] )
plot_utils.plot_distributions( path_to_plots, data_df[mask_data], drell_yan_df[mask_mc], mc_weights , [var_list, conditions_list], weights_befores_rw = mc_weights_before[mask_mc])

# now, the datasets will be separated into training, validation and test datasets, and saved for further reading by the training class!
separate_training_data(data_df[mask_data], drell_yan_df[mask_mc], mc_weights, data_weights, var_list, conditions_list)
4 changes: 4 additions & 0 deletions main.py
@@ -15,6 +15,10 @@
import plot.plot_utils as plot_utils
import normalizing_flows.training_utils as training_utils

def test_big_boy():
assert 1 == 1


def main():
print("Welcome to the simulation corrections 2000!")

20 changes: 14 additions & 6 deletions normalizing_flows/training_utils.py
@@ -217,7 +217,7 @@ def perform_transformation(self):
for index in self.indexes_for_iso_transform:

# since hoe has very low values, the shift value (value until triangular events are sampled) must be different here
if( index == 7 ):
if( index == 6 ):
self.vector_for_iso_constructors_data.append( Make_iso_continuous(self.data_training_inputs[:,index], b = 0.001) )
self.vector_for_iso_constructors_mc.append( Make_iso_continuous(self.mc_training_inputs[:,index] , b= 0.001) )
else:
@@ -252,14 +252,22 @@ def perform_transformation(self):
self.validation_conditions = torch.cat([ self.data_validation_conditions, self.mc_validation_conditions], axis = 0).to(self.device).to(self.device)
self.validation_weights = torch.tensor(np.concatenate([ self.data_validation_weights, self.mc_validation_weights], axis = 0)).to(self.device)

# We now perform the standardization of the training and validation arrays
self.input_mean_for_std = torch.mean( self.training_inputs, 0 )
self.input_std_for_std = torch.std( self.training_inputs, 0 )
# We now perform the standardization of the training and validation arrays - let's use only MC to calculate the means and standard deviations
self.input_mean_for_std = torch.mean( self.training_inputs[ self.training_conditions[:, self.training_conditions.size()[1] -1 ] == 0 ], 0 )
self.input_std_for_std = torch.std( self.training_inputs[ self.training_conditions[:, self.training_conditions.size()[1] -1 ] == 0 ], 0 )

# the last element of the condition tensor is a boolean, so of course we do not transform that xD
self.condition_mean_for_std = torch.mean( self.training_conditions[:,:-1], 0 )
self.condition_std_for_std = torch.std( self.training_conditions[:,:-1], 0 )
self.condition_mean_for_std = torch.mean( self.training_conditions[:,:-1][ self.training_conditions[:, self.training_conditions.size()[1] -1 ] == 0 ], 0 )
self.condition_std_for_std = torch.std( self.training_conditions[:,:-1][ self.training_conditions[:, self.training_conditions.size()[1] -1 ] == 0 ], 0 )

"""
print( self.input_mean_for_std )
print( self.input_std_for_std )
print( self.condition_mean_for_std )
print( self.condition_std_for_std )
exit()
"""

# transforming the training tensors
self.training_inputs = ( self.training_inputs - self.input_mean_for_std )/self.input_std_for_std
self.training_conditions[:,:-1] = ( self.training_conditions[:,:-1] - self.condition_mean_for_std )/self.condition_std_for_std
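`Make_iso_continuous` itself is not shown in this diff. As background, a minimal sketch of the usual recipe, assuming the zero spike of an isolation variable is smeared with triangular noise below the shift `b` before a log transform; all names and details here are illustrative, not the repository's code:

```python
import numpy as np

# illustrative sketch of a Make_iso_continuous-style transform
class MakeIsoContinuousSketch:
    def __init__(self, b=0.05):
        self.b = b  # boundary below which smeared zero-events live

    def transform(self, x):
        x = np.asarray(x, dtype=float).copy()
        zeros = (x == 0)
        # triangular noise peaked at 0 and vanishing at b fills the zero spike
        x[zeros] = np.random.triangular(0.0, 0.0, self.b, size=zeros.sum())
        x[~zeros] += self.b  # shift genuine values above the smeared region
        return np.log(x)     # compress the long isolation tail

    def inverse(self, y):
        x = np.exp(np.asarray(y, dtype=float))
        below = x < self.b
        x[~below] -= self.b
        x[below] = 0.0       # everything below b maps back to exactly zero
        return x
```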
44 changes: 24 additions & 20 deletions plot/plot_utils.py
@@ -163,9 +163,9 @@ def plott(data_hist,mc_hist,mc_rw_hist ,output_filename,xlabel,region=None ):
ax[0].text(0.05, 0.68, r"$p_\mathrm{T}(Z)$: " + region.split("_ZpT_")[1].replace("_", "-") + "$\,$GeV", fontsize=22, transform=ax[0].transAxes)
ax[0].tick_params(labelsize=24)
#ax.set_ylim(0., 1.1*ax.get_ylim()[1])
ax[1].set_ylim(0.6, 1.4)
ax[1].set_ylim(0.71, 1.29)
if( 'mva' in xlabel ):
ax[1].set_ylim(0.75, 1.25)
ax[1].set_ylim(0.79, 1.21)

ax[0].legend(
loc="upper right", fontsize=24
@@ -182,7 +182,7 @@ def plott(data_hist,mc_hist,mc_rw_hist ,output_filename,xlabel,region=None ):
return 0

# This is a plotting function suitable for dataframes; to plot tensors, see "plot_distributions_for_tensors()"
def plot_distributions( path, data_df, mc_df, mc_weights, variables_to_plot ):
def plot_distributions( path, data_df, mc_df, mc_weights, variables_to_plot, weights_befores_rw = None ):

for set in variables_to_plot:

@@ -205,7 +205,11 @@ def plot_distributions( path, data_df, mc_df, mc_weights, variables_to_plot ):
mc_rw_hist = hist.Hist(hist.axis.Regular(70, mean - 2.0*std, mean + 2.0*std))

data_hist.fill( np.array(data_df[key] ) )
mc_hist.fill( np.array( mc_df[key]) )

if( weights_befores_rw is not None ): # plot MC with its pre-reweighting weights when provided
mc_hist.fill( np.array( mc_df[key]), weight = weights_befores_rw )
else:
mc_hist.fill( np.array( mc_df[key]) )

#print( np.shape( np.array(drell_yan_df[key]) ) , np.shape( mc_weights ) )

@@ -220,19 +224,19 @@ def plot_distributions_for_tensors( data_tensor, mc_tensor, flow_samples, mc_wei
mean = np.mean( np.array(data_tensor[:,i]) )
std = np.std( np.array(data_tensor[:,i]) )

if( 'Iso' in str(var_list[i]) or 'DR' in str(var_list[i]) or 'esE' in str(var_list[i]) or 'hoe' in str(var_list[i]) ):
data_hist = hist.Hist(hist.axis.Regular(70, 0.0 , mean + 2.0*std))
mc_hist = hist.Hist(hist.axis.Regular(70, 0.0 , mean + 2.0*std))
mc_rw_hist = hist.Hist(hist.axis.Regular(70, 0.0 , mean + 2.0*std))
if( 'Iso' in str(var_list[i]) or 'DR' in str(var_list[i]) or 'esE' in str(var_list[i]) or 'hoe' in str(var_list[i]) or 'energy' in str(var_list[i]) ):
data_hist = hist.Hist(hist.axis.Regular(45, 0.0 , mean + 2.0*std))
mc_hist = hist.Hist(hist.axis.Regular(45, 0.0 , mean + 2.0*std))
mc_rw_hist = hist.Hist(hist.axis.Regular(45, 0.0 , mean + 2.0*std))
else:
data_hist = hist.Hist(hist.axis.Regular(70, mean - 2.0*std, mean + 2.0*std))
mc_hist = hist.Hist(hist.axis.Regular(70, mean - 2.0*std, mean + 2.0*std))
mc_rw_hist = hist.Hist(hist.axis.Regular(70, mean - 2.0*std, mean + 2.0*std))
data_hist = hist.Hist(hist.axis.Regular(45, mean - 2.5*std, mean + 2.5*std))
mc_hist = hist.Hist(hist.axis.Regular(45, mean - 2.5*std, mean + 2.5*std))
mc_rw_hist = hist.Hist(hist.axis.Regular(45, mean - 2.5*std, mean + 2.5*std))

if( 'DR04' in str(var_list[i]) ):
data_hist = hist.Hist(hist.axis.Regular(70, 0.0 , 5.0))
mc_hist = hist.Hist(hist.axis.Regular(70, 0.0 , 5.0))
mc_rw_hist = hist.Hist(hist.axis.Regular(70, 0.0 , 5.0))
data_hist = hist.Hist(hist.axis.Regular(50, 0.0 , 5.0))
mc_hist = hist.Hist(hist.axis.Regular(50, 0.0 , 5.0))
mc_rw_hist = hist.Hist(hist.axis.Regular(50, 0.0 , 5.0))

data_hist.fill( np.array(data_tensor[:,i] ) )
mc_hist.fill( np.array( mc_tensor[:,i]), weight = 1e6*mc_weights )
@@ -295,9 +299,9 @@ def plot_mvaID_curve(mc_inputs,data_inputs,nl_inputs, mc_conditions, data_condit
plot_profile_barrel( nl_mvaID, mc_mvaID ,mc_conditions, data_mvaID, data_conditions, mc_weights, data_weights, plot_path)

# now, we create and fill the histograms with the mvaID distributions
mc_mva = hist.Hist(hist.axis.Regular(70, -0.9, 1.0))
nl_mva = hist.Hist(hist.axis.Regular(70, -0.9, 1.0))
data_mva = hist.Hist(hist.axis.Regular(70, -0.9, 1.0))
mc_mva = hist.Hist(hist.axis.Regular(42, -0.9, 1.0))
nl_mva = hist.Hist(hist.axis.Regular(42, -0.9, 1.0))
data_mva = hist.Hist(hist.axis.Regular(42, -0.9, 1.0))

mc_mva.fill( mc_mvaID, weight = (1e6)*mc_weights )
nl_mva.fill( nl_mvaID, weight = (1e6)*mc_weights )
@@ -351,9 +355,9 @@ def plot_mvaID_curve_endcap(mc_inputs,data_inputs,nl_inputs, mc_conditions, data
plot_profile_endcap( nl_mvaID, mc_mvaID ,mc_conditions, data_mvaID, data_conditions, mc_weights, data_weights, plot_path)

# now, we create and fill the histograms with the mvaID distributions
mc_mva = hist.Hist(hist.axis.Regular(70, -0.9, 1.0))
nl_mva = hist.Hist(hist.axis.Regular(70, -0.9, 1.0))
data_mva = hist.Hist(hist.axis.Regular(70, -0.9, 1.0))
mc_mva = hist.Hist(hist.axis.Regular(30, -0.9, 1.0))
nl_mva = hist.Hist(hist.axis.Regular(30, -0.9, 1.0))
data_mva = hist.Hist(hist.axis.Regular(30, -0.9, 1.0))

mc_mva.fill( mc_mvaID, weight = (1e6)*mc_weights )
nl_mva.fill( nl_mvaID, weight = (1e6)*mc_weights )
