diff --git a/Train/train_DeepCSV.py b/Train/train_DeepCSV.py new file mode 100644 index 0000000..45b3ce4 --- /dev/null +++ b/Train/train_DeepCSV.py @@ -0,0 +1,39 @@ + +#import sys +#import tensorflow as tf +#sys.modules["keras"] = tf.keras + +from DeepJetCore.training.training_base import training_base +from DeepJetCore.modeltools import fixLayersContaining,printLayerInfosAndWeights + + +#also does all the parsing +train=training_base(testrun=False) + +newtraining= not train.modelSet() +#for recovering a training +if newtraining: + from models import model_deepCSV + + train.setModel(model_deepCSV,dropoutRate=0.1) + + #train.keras_model=fixLayersContaining(train.keras_model, 'regression', invert=False) + + train.train_data.maxFilesOpen=1 + +train.compileModel(learningrate=0.003, + loss='categorical_crossentropy', + metrics=['accuracy']) + +print(train.keras_model.summary()) +#printLayerInfosAndWeights(train.keras_model) + +model,history = train.trainModel(nepochs=50, + batchsize=10000, + stop_patience=300, + lr_factor=0.5, + lr_patience=-1, + lr_epsilon=0.0001, + lr_cooldown=10, + lr_minimum=0.00001, + verbose=1,checkperiod=1) diff --git a/env.sh b/env.sh index 31e8538..61c11c8 100644 --- a/env.sh +++ b/env.sh @@ -7,6 +7,8 @@ export DEEPJETCORE_SUBPACKAGE=$DJSUBPACKAGE cd $DJSUBPACKAGE export PYTHONPATH=$DJSUBPACKAGE/modules:$PYTHONPATH export PYTHONPATH=$DJSUBPACKAGE/modules/datastructures:$PYTHONPATH +export PYTHONPATH=$DJSUBPACKAGE/modules/models:$PYTHONPATH + export PATH=$DJSUBPACKAGE/scripts:$PATH export LD_LIBRARY_PATH=$DJSUBPACKAGE/modules/compiled:$LD_LIBRARY_PATH diff --git a/modules/datastructures/TrainData_deepFlavour.py b/modules/datastructures/TrainData_deepFlavour.py index 1c37727..0b86328 100644 --- a/modules/datastructures/TrainData_deepFlavour.py +++ b/modules/datastructures/TrainData_deepFlavour.py @@ -2,7 +2,7 @@ from DeepJetCore.TrainData import TrainData, fileTimeOut import numpy as np - +from argparse import ArgumentParser class TrainData_DF(TrainData): @@ -10,6 +10,7 @@ def __init__(self): TrainData.__init__(self) + self.description = "DeepJet training datastructure" self.truth_branches = ['isB','isBB','isGBB','isLeptonicB','isLeptonicB_C','isC','isGCC','isCC','isUD','isS','isG'] @@ -18,7 +19,6 @@ def __init__(self): self.weightbranchY='jet_eta' self.remove = True self.referenceclass='isB' - self.weight_binX = np.array([ 10,25,30,35,40,45,50,60,75,100, 125,150,175,200,250,300,400,500, @@ -126,27 +126,29 @@ def convertFromSourceFile(self, filename, weighterobjects, istraining): from DeepJetCore.stopwatch import stopwatch sw=stopwatch() swall=stopwatch() + if not istraining: + self.remove = False def reduceTruth(uproot_arrays): - b = uproot_arrays['isB'] + b = uproot_arrays[b'isB'] - bb = uproot_arrays['isBB'] - gbb = uproot_arrays['isGBB'] + bb = uproot_arrays[b'isBB'] + gbb = uproot_arrays[b'isGBB'] - bl = uproot_arrays['isLeptonicB'] - blc = uproot_arrays['isLeptonicB_C'] + bl = uproot_arrays[b'isLeptonicB'] + blc = uproot_arrays[b'isLeptonicB_C'] lepb = bl+blc - c = uproot_arrays['isC'] - cc = uproot_arrays['isCC'] - gcc = uproot_arrays['isGCC'] + c = uproot_arrays[b'isC'] + cc = uproot_arrays[b'isCC'] + gcc = uproot_arrays[b'isGCC'] - ud = uproot_arrays['isUD'] - s = uproot_arrays['isS'] + ud = uproot_arrays[b'isUD'] + s = uproot_arrays[b'isS'] uds = ud+s - g = uproot_arrays['isG'] + g = uproot_arrays[b'isG'] return np.vstack((b,bb+gbb,lepb,c+cc+gcc,uds,g)).transpose() @@ -205,7 +207,6 @@ def reduceTruth(uproot_arrays): stop = None, branches = b ) - print weighterobjects notremoves=weighterobjects['weigther'].createNotRemoveIndices(for_remove) undef=for_remove['isUndefined'] notremoves-=undef @@ -240,3 +241,252 @@ def writeOutPrediction(self, predicted, features, truth, weights, outfilename, i out = np.core.records.fromarrays(np.vstack( (predicted[0].transpose(),truth[0].transpose(), features[0][:,0:2].transpose() ) ), names='prob_isB, prob_isBB,prob_isLeptB, prob_isC,prob_isUDS,prob_isG,isB, isBB, isLeptB, isC,isUDS,isG,jet_pt, jet_eta') array2root(out, outfilename, 'tree') + + + +class TrainData_DeepCSV(TrainData): + def __init__(self): + + TrainData.__init__(self) + + self.description = "DeepCSV training datastructure" + + self.truth_branches = ['isB','isBB','isGBB','isLeptonicB','isLeptonicB_C','isC','isGCC','isCC','isUD','isS','isG'] + self.undefTruth=['isUndefined'] + self.weightbranchX='jet_pt' + self.weightbranchY='jet_eta' + self.remove = True + self.referenceclass='isB' + self.weight_binX = np.array([ + 10,25,30,35,40,45,50,60,75,100, + 125,150,175,200,250,300,400,500, + 600,2000],dtype=float) + + self.weight_binY = np.array( + [-2.5,-2.,-1.5,-1.,-0.5,0.5,1,1.5,2.,2.5], + dtype=float + ) + + self.global_branches = ['jet_pt', 'jet_eta', + 'TagVarCSV_trackSumJetEtRatio', + 'TagVarCSV_trackSumJetDeltaR', + 'TagVarCSV_vertexCategory', + 'TagVarCSV_trackSip2dValAboveCharm', + 'TagVarCSV_trackSip2dSigAboveCharm', + 'TagVarCSV_trackSip3dValAboveCharm', + 'TagVarCSV_trackSip3dSigAboveCharm', + 'TagVarCSV_jetNSelectedTracks', + 'TagVarCSV_jetNTracksEtaRel'] + + self.track_branches = ['TagVarCSVTrk_trackJetDistVal', + 'TagVarCSVTrk_trackPtRel', + 'TagVarCSVTrk_trackDeltaR', + 'TagVarCSVTrk_trackPtRatio', + 'TagVarCSVTrk_trackSip3dSig', + 'TagVarCSVTrk_trackSip2dSig', + 'TagVarCSVTrk_trackDecayLenVal'] + self.n_track = 6 + + self.eta_rel_branches = ['TagVarCSV_trackEtaRel'] + self.n_eta_rel = 4 + + self.vtx_branches = ['TagVarCSV_vertexMass', + 'TagVarCSV_vertexNTracks', + 'TagVarCSV_vertexEnergyRatio', + 'TagVarCSV_vertexJetDeltaR', + 'TagVarCSV_flightDistance2dVal', + 'TagVarCSV_flightDistance2dSig', + 'TagVarCSV_flightDistance3dVal', + 'TagVarCSV_flightDistance3dSig'] + self.n_vtx = 1 + + self.reduced_truth = ['isB','isBB','isC','isUDSG'] + + def readTreeFromRootToTuple(self, filenames, limit=None, branches=None): + ''' + To be used to get the initial tupel for further processing in inherting classes + Makes sure the number of entries is properly set + + can also read a list of files (e.g. to produce weights/removes from larger statistics + (not fully tested, yet) + ''' + + if branches is None or len(branches) == 0: + return np.array([],dtype='float32') + + #print(branches) + #remove duplicates + usebranches=list(set(branches)) + tmpbb=[] + for b in usebranches: + if len(b): + tmpbb.append(b) + usebranches=tmpbb + + import ROOT + from root_numpy import tree2array, root2array + if isinstance(filenames, list): + for f in filenames: + fileTimeOut(f,120) + print('add files') + nparray = root2array( + filenames, + treename = "deepntuplizer/tree", + stop = limit, + branches = usebranches + ) + print('done add files') + return nparray + print('add files') + else: + fileTimeOut(filenames,120) #give eos a minute to recover + rfile = ROOT.TFile(filenames) + tree = rfile.Get(self.treename) + if not self.nsamples: + self.nsamples=tree.GetEntries() + nparray = tree2array(tree, stop=limit, branches=usebranches) + return nparray + + def createWeighterObjects(self, allsourcefiles): + # + # Calculates the weights needed for flattening the pt/eta spectrum + + from DeepJetCore.Weighter import Weighter + weighter = Weighter() + weighter.undefTruth = self.undefTruth + branches = [self.weightbranchX,self.weightbranchY] + branches.extend(self.truth_branches) + + if self.remove: + weighter.setBinningAndClasses( + [self.weight_binX,self.weight_binY], + self.weightbranchX,self.weightbranchY, + self.truth_branches + ) + + + counter=0 + import ROOT + from root_numpy import tree2array, root2array + if self.remove: + for fname in allsourcefiles: + fileTimeOut(fname, 120) + nparray = root2array( + fname, + treename = "deepntuplizer/tree", + stop = None, + branches = branches + ) + weighter.addDistributions(nparray) + #del nparray + counter=counter+1 + weighter.createRemoveProbabilitiesAndWeights(self.referenceclass) + + print("calculate means") + from DeepJetCore.preprocessing import meanNormProd + nparray = self.readTreeFromRootToTuple(allsourcefiles,branches=self.vtx_branches+self.eta_rel_branches+self.track_branches+self.global_branches, limit=500000) + for a in (self.vtx_branches+self.eta_rel_branches+self.track_branches+self.global_branches): + for b in range(len(nparray[a])): + nparray[a][b] = np.where(nparray[a][b] < 100000.0, nparray[a][b], 0) + means = np.array([],dtype='float32') + if len(nparray): + means = meanNormProd(nparray) + return {'weigther':weighter,'means':means} + + def convertFromSourceFile(self, filename, weighterobjects, istraining): + + # Function to produce the numpy training arrays from root files + + from DeepJetCore.Weighter import Weighter + from DeepJetCore.stopwatch import stopwatch + sw=stopwatch() + swall=stopwatch() + print(weighterobjects) + if not istraining: + self.remove = False + + def reduceTruth(uproot_arrays): + + b = uproot_arrays[b'isB'] + + bb = uproot_arrays[b'isBB'] + gbb = uproot_arrays[b'isGBB'] + + bl = uproot_arrays[b'isLeptonicB'] + blc = uproot_arrays[b'isLeptonicB_C'] + lepb = bl+blc + + c = uproot_arrays[b'isC'] + cc = uproot_arrays[b'isCC'] + gcc = uproot_arrays[b'isGCC'] + + ud = uproot_arrays[b'isUD'] + s = uproot_arrays[b'isS'] + uds = ud+s + + g = uproot_arrays[b'isG'] + + return np.vstack((b+lepb,bb+gbb,c+cc+gcc,uds+g)).transpose() + + print('reading '+filename) + + import ROOT + from root_numpy import tree2array, root2array + fileTimeOut(filename,120) #give eos a minute to recover + rfile = ROOT.TFile(filename) + tree = rfile.Get("deepntuplizer/tree") + self.nsamples = tree.GetEntries() + # user code, example works with the example 2D images in root format generated by make_example_data + from DeepJetCore.preprocessing import MeanNormZeroPad,MeanNormZeroPadParticles + x_global = MeanNormZeroPad(filename,weighterobjects['means'], + [self.global_branches,self.track_branches,self.eta_rel_branches,self.vtx_branches], + [1,self.n_track,self.n_eta_rel,self.n_vtx],self.nsamples) + + import uproot + urfile = uproot.open(filename)["deepntuplizer/tree"] + truth_arrays = urfile.arrays(self.truth_branches) + truth = reduceTruth(truth_arrays) + truth = truth.astype(dtype='float32', order='C') #important, float32 and C-type! + + x_global = x_global.astype(dtype='float32', order='C') + + + if self.remove: + b = [self.weightbranchX,self.weightbranchY] + b.extend(self.truth_branches) + b.extend(self.undefTruth) + fileTimeOut(filename, 120) + for_remove = root2array( + filename, + treename = "deepntuplizer/tree", + stop = None, + branches = b + ) + notremoves=weighterobjects['weigther'].createNotRemoveIndices(for_remove) + undef=for_remove['isUndefined'] + notremoves-=undef + print('took ', sw.getAndReset(), ' to create remove indices') + + + if self.remove: + print('remove') + x_global=x_global[notremoves > 0] + truth=truth[notremoves > 0] + + newnsamp=x_global.shape[0] + print('reduced content to ', int(float(newnsamp)/float(self.nsamples)*100),'%') + + + print('remove nans') + x_global = np.where(np.isfinite(x_global), x_global, 0) + + return [x_global], [truth], [] + + ## defines how to write out the prediction + def writeOutPrediction(self, predicted, features, truth, weights, outfilename, inputfile): + # predicted will be a list + + from root_numpy import array2root + out = np.core.records.fromarrays(np.vstack( (predicted[0].transpose(),truth[0].transpose(), features[0][:,0:2].transpose() ) ), + names='prob_isB, prob_isBB, prob_isC,prob_isUDSG,isB, isBB, isC,isUDSG,jet_pt, jet_eta') + array2root(out, outfilename, 'tree') diff --git a/modules/models/convolutional.py b/modules/models/convolutional.py index e89ee0f..84e60cf 100644 --- a/modules/models/convolutional.py +++ b/modules/models/convolutional.py @@ -56,3 +56,31 @@ def model_deepFlavourReference(Inputs,dropoutRate=0.1,momentum=0.6): return model +def model_deepCSV(Inputs,dropoutRate=0.1): + """ + reference 1x1 convolutional model for 'deepFlavour' + with recurrent layers and batch normalisation + standard dropout rate it 0.1 + should be trained for flavour prediction first. afterwards, all layers can be fixed + that do not include 'regression' and the training can be repeated focusing on the regression part + (check function fixLayersContaining with invert=True) + """ + globalvars = Inputs[0] + + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(Inputs[0]) + x = Dropout(dropoutRate)(x) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + x = Dropout(dropoutRate)(x) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + x = Dropout(dropoutRate)(x) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + x = Dropout(dropoutRate)(x) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + + predictions = Dense(4, activation='softmax',kernel_initializer='lecun_uniform')(x) + + model = Model(inputs=Inputs, outputs=predictions) + return model + + + diff --git a/modules/scripts/plot_roc.py b/modules/scripts/plot_roc.py new file mode 100644 index 0000000..c40d1b2 --- /dev/null +++ b/modules/scripts/plot_roc.py @@ -0,0 +1,84 @@ +print("start import") +import os +import matplotlib +matplotlib.use('Agg') +from sklearn.metrics import roc_curve, roc_auc_score +from scipy.interpolate import InterpolatedUnivariateSpline +from pdb import set_trace +from itertools import chain +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np +import root_numpy +import ROOT +from ROOT import TCanvas, TGraph, TGraphAsymmErrors, TH2F, TH1F +print("finish import") +from root_numpy import fill_hist + + +def spit_out_roc(disc,truth_array,selection_array): + + newx = np.logspace(-3.5, 0, 100) + tprs = pd.DataFrame() + truth = truth_array[selection_array]*1 + disc = disc[selection_array] + tmp_fpr, tmp_tpr, _ = roc_curve(truth, disc) + coords = pd.DataFrame() + coords['fpr'] = tmp_fpr + coords['tpr'] = tmp_tpr + clean = coords.drop_duplicates(subset=['fpr']) + spline = InterpolatedUnivariateSpline(clean.fpr, clean.tpr,k=1) + tprs = spline(newx) + return tprs, newx + + + +pred = [] +isDeepJet = False +if isDeepJet: + listbranch = ['prob_isB', 'prob_isBB','prob_isLeptB', 'prob_isC','prob_isUDS','prob_isG','isB', 'isBB', 'isLeptB', 'isC','isUDS','isG','jet_pt', 'jet_eta'] +else: + listbranch = ['prob_isB', 'prob_isBB', 'prob_isC','prob_isUDSG','isB', 'isBB', 'isC','isUDSG','jet_pt', 'jet_eta'] + +dirz = '/data/ml/ebols/DeepCSV_PredictionsTest_2/' +truthfile = open( dirz+'outfiles.txt','r') +print("opened text file") +rfile1 = ROOT.TChain("tree") +count = 0 + +for line in truthfile: + count += 1 + if len(line) < 1: continue + file1name=str(dirz+line.split('\n')[0]) + rfile1.Add(file1name) + +print("added files") +df = root_numpy.tree2array(rfile1, branches = listbranch) +print("converted to root") + +if isDeepJet: + b_jets = df['isB']+df['isBB']+df['isLeptB'] + disc = df['prob_isB']+df['prob_isBB']+df['prob_isLeptB'] + summed_truth = df['isB']+df['isBB']+df['isLeptB']+df['isC']+df['isUDS']+df['isG'] + veto_c = (df['isC'] != 1) & ( df['jet_pt'] > 30) & (summed_truth != 0) + veto_udsg = (df['isUDS'] != 1) & (df['isG'] != 1) & ( df['jet_pt'] > 30) & (summed_truth != 0) +else: + b_jets = df['isB']+df['isBB'] + disc = df['prob_isB']+df['prob_isBB'] + summed_truth = df['isB']+df['isBB']+df['isC']+df['isUDSG'] + veto_c = (df['isC'] != 1) & (summed_truth != 0) + veto_udsg = (df['isUDSG'] != 1) & (summed_truth != 0) + + +f = ROOT.TFile("ROCS_DeepCSV.root", "recreate") + +x1, y1 = spit_out_roc(disc,b_jets,veto_c) +x2, y2 = spit_out_roc(disc,b_jets,veto_udsg) + +gr1 = TGraph( 100, x1, y1 ) +gr1.SetName("roccurve_0") +gr2 = TGraph( 100, x2, y2 ) +gr2.SetName("roccurve_1") +gr1.Write() +gr2.Write() +f.Write()