Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix feeddown folding #945

Draft
wants to merge 5 commits into
base: run3
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .envrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
if [[ `hostname` = "alicecerno2" ]] && [[ -z ${ROOT_BASEDIR} ]]; then

PYTHON_VERSION=3.10.14
ROOT_VERSION=v6-32-04
ROOUNFOLD_VERSION=2.0.1

PREFIX=/home/pyadmin/software_mlhep
layout python ${PREFIX}/install/pyenv/versions/${PYTHON_VERSION}/bin/python3
path_add PYTHONPATH ${PREFIX}/install/root-${ROOT_VERSION}_py-${PYTHON_VERSION}/lib
PATH_add ${PREFIX}/install/root-${ROOT_VERSION}_py-${PYTHON_VERSION}/bin
path_add LD_LIBRARY_PATH ${PREFIX}/install/root-${ROOT_VERSION}_py-${PYTHON_VERSION}/lib
# path_add LD_LIBRARY_PATH ${PREFIX}/install/RooUnfold-${ROOUNFOLD_VERSION}_root-${ROOT_VERSION}_py-${PYTHON_VERSION}/lib
path_add LD_LIBRARY_PATH ${PREFIX}/build/RooUnfold-${ROOUNFOLD_VERSION}_root-${ROOT_VERSION}_py-${PYTHON_VERSION}

fi
30 changes: 23 additions & 7 deletions machine_learning_hep/analysis/analyzer_jets.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def calculate_efficiencies(self):
# gen-level efficiency for feeddown estimation
h_eff_gen = h_genmatch[cat].Clone()
h_eff_gen.Divide(h_gen[cat])
self._save_hist(h_eff_gen, f'eff/h_effgen_{cat}.png')
self._save_hist(h_eff_gen, f'eff/h_effgen_{cat}.png', 'text')
self.hcandeff_gen[cat] = h_eff_gen

# matching loss
Expand Down Expand Up @@ -854,6 +854,7 @@ def _analyze(self, method = 'sidesub'):
self.logger.debug("Final histogram: %s, jet pT %g to %g",
var, range_ptjet[0], range_ptjet[1])
# self.logger.debug(print_histogram(hproj_sel))
ROOT.gStyle.SetOptStat(0)
self._save_hist(
hproj_sel,
f'uf/h_{var}_{method}_unfolded_{mcordata}_' +
Expand Down Expand Up @@ -962,29 +963,36 @@ def estimate_feeddown(self):
f';p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}',
bins_ptjet, self.bins_candpt, bins_obs[var])
fill_hist_fast(h3_fd_gen_orig, df[['pt_jet', 'pt_cand', f'{colname}']])
self._save_hist(project_hist(h3_fd_gen_orig, [0, 2], {}), f'fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png')

# new method
h3_fd_gen = h3_fd_gen_orig.Clone()
ensure_sumw2(h3_fd_gen)
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen.png')
# apply np efficiency
for ipt in range(get_nbins(h3_fd_gen, 1)):
eff_np = self.hcandeff_gen['np'].GetBinContent(ipt+1)
for iptjet, ishape in itertools.product(
range(get_nbins(h3_fd_gen, 0)), range(get_nbins(h3_fd_gen, 2))):
scale_bin(h3_fd_gen, eff_np, iptjet+1, ipt+1, ishape+1)
for iptjet in range(get_nbins(h3_fd_gen, 0)):
eff_np = self.hcandeff_gen['np'].GetBinContent(iptjet + 1, ipt + 1)
print(f'scaling with {eff_np=} for {iptjet=}, {ipt=}', flush=True)
for ishape in range(get_nbins(h3_fd_gen, 2)):
scale_bin(h3_fd_gen, eff_np, iptjet+1, ipt+1, ishape+1)
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_geneff.png')

# 3d folding incl. kinematic efficiencies
with TFile(self.n_fileeff) as rfile:
h_effkine_gen = self._build_effkine(
rfile.Get(f'h_effkine_fd_gen_nocuts_{var}'),
rfile.Get(f'h_effkine_fd_gen_cut_{var}'))
self._save_hist(h_effkine_gen, f'fd/h_effkine-ptjet-{var}_fd_gen.png', 'text')
h_effkine_det = self._build_effkine(
rfile.Get(f'h_effkine_fd_det_nocuts_{var}'),
rfile.Get(f'h_effkine_fd_det_cut_{var}'))
self._save_hist(h_effkine_det, f'fd/h_effkine-ptjet-{var}_fd_det.png', 'text')
h_response = rfile.Get(f'h_response_fd_{var}')
self._save_hist(project_hist(h_response, [0, 3], {}), f'fd/h_response_ptjet_{var}.png')
self._save_hist(project_hist(h_response, [1, 4], {}), f'fd/h_response_pthf_{var}.png')
self._save_hist(project_hist(h_response, [2, 5], {}), f'fd/h_response_shape_{var}.png')
h_response.Print('all')
print(f'fd folding for {var=}')
h_response_norm = norm_response(h_response, 3)
h3_fd_gen.Multiply(h_effkine_gen)
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_gen_genkine.png')
Expand All @@ -1002,14 +1010,15 @@ def estimate_feeddown(self):
for iptjet, ishape in itertools.product(
range(get_nbins(h3_fd_det, 0)), range(get_nbins(h3_fd_det, 2))):
scale_bin(h3_fd_det, 1./eff_pr, iptjet+1, ipt+1, ishape+1)
self._save_hist(project_hist(h3_fd_det, [0, 2], {}), f'fd/h_ptjet-{var}_fdnew_det_deteff.png')

# project to 2d (ptjet-shape)
h_fd_det = project_hist(h3_fd_det, [0, 2], {})
self._save_hist(h_fd_det, f'fd/h_ptjet-{var}_fdnew_det_deteff.png')

# old method
h3_fd_gen = h3_fd_gen_orig.Clone()
ensure_sumw2(h3_fd_gen)
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f'fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png')
for ipt in range(get_nbins(h3_fd_gen, 1)):
eff_pr = self.hcandeff['pr'].GetBinContent(ipt+1)
eff_np = self.hcandeff['np'].GetBinContent(ipt+1)
Expand Down Expand Up @@ -1051,6 +1060,13 @@ def estimate_feeddown(self):
hfeeddown_det.Divide(h_effkine_det)
self._save_hist(hfeeddown_det, f'fd/h_ptjet-{var}_feeddown_det_kineeffscaled.png')

c = TCanvas()
c.cd()
hc = project_hist(h_fd_det, [1], {0: (3, 3)}).DrawCopy()
hc.SetLineColor(ROOT.kViolet)
hc = project_hist(hfeeddown_det, [1], {0: (3, 3)}).DrawCopy('same')
self._save_canvas(c, f'fd/h_{var}_fdcmp.png')

if self.cfg('fd_folding_method') == '3d':
self.logger.info('using 3d folding for feeddown estimation for %s', var)
hfeeddown_det = h_fd_det
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -446,8 +446,8 @@ D0Jet_pp:
components:
sig:
fn: 'Gaussian::peak(m[1.,5.], mean[1.85,1.89], sigma_g1[.01,.08])'
bkg:
fn: 'Gaussian::wide(m, mean, sigma_wide[.05,1.])'
wide:
fn: 'Gaussian::wide(m, mean, expr("n*sigma_g1", n[1.,5.], sigma_g1))'
model:
fn: 'SUM::sig(frac_wide[0.,.3]*wide, peak)'
- level: mcrefl
Expand Down Expand Up @@ -530,49 +530,49 @@ D0Jet_pp:
- level: mc
ptrange: [1., 3.]
range: [1.69, 2.04]
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'sigma_wide']
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'n']
components:
model:
fn: 'SUM::sigrefl(frac_refl[0.,1.]*refl, sig)'
- level: mc
ptrange: [3., 4.]
range: [1.68, 2.06]
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'sigma_wide']
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'n']
components:
model:
fn: 'SUM::sigrefl(frac_refl[0.,1.]*refl, sig)'
- level: mc
ptrange: [4., 5.]
range: [1.64, 2.08]
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'sigma_wide']
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'n']
components:
model:
fn: 'SUM::sigrefl(frac_refl[0.,1.]*refl, sig)'
- level: mc
ptrange: [5., 6.]
range: [1.64, 2.10]
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'sigma_wide']
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'n']
components:
model:
fn: 'SUM::sigrefl(frac_refl[0.,1.]*refl, sig)'
- level: mc
ptrange: [6., 8.]
range: [1.60, 2.14]
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'sigma_wide']
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'n']
components:
model:
fn: 'SUM::sigrefl(frac_refl[0.,1.]*refl, sig)'
- level: mc
ptrange: [8., 12.]
range: [1.52, 2.30]
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'sigma_wide']
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'n']
components:
model:
fn: 'SUM::sigrefl(frac_refl[0.,1.]*refl, sig)'
- level: mc
ptrange: [12., 48.]
range: [1.40, 2.40]
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'sigma_wide']
fix_params: ['frac_l', 'mean_l', 'mean_r', 'sigma_l', 'sigma_r', 'frac_wide', 'sigma_g1', 'n']
components:
model:
fn: 'SUM::sigrefl(frac_refl[0.,1.]*refl, sig)'
Expand Down Expand Up @@ -701,6 +701,7 @@ D0Jet_pp:
frac_mcana: .2 # fraction of MC sample for the closure
fd_root: '/data2/vkucera/powheg/trees_powheg_fd_central.root' # systematics
fd_parquet: '/data2/jklein/powheg/trees_powheg_fd_central.parquet' # systematics
fd_folding_method: 3d

# obsolete?
proc_type: Jets # used
Expand Down Expand Up @@ -803,7 +804,19 @@ D0Jet_pp:

# Additional cuts applied before mass histogram is filled
use_cuts: True # systematics
cuts: ["mlBkgScore < 0.02", "mlBkgScore < 0.02", "mlBkgScore < 0.02", "mlBkgScore < 0.05", "mlBkgScore < 0.06", "mlBkgScore < 0.08", "mlBkgScore < 0.08", "mlBkgScore < 0.10", "mlBkgScore < 0.10", "mlBkgScore < 0.20", "mlBkgScore < 0.25", "mlBkgScore < 0.30"] # (sel_an_binmin bins) systematics FIXME: Update for new model.
cuts:
- "mlBkgScore < 0.02"
- "mlBkgScore < 0.02"
- "mlBkgScore < 0.02"
- "mlBkgScore < 0.05"
- "mlBkgScore < 0.06"
- "mlBkgScore < 0.08"
- "mlBkgScore < 0.08"
- "mlBkgScore < 0.10"
- "mlBkgScore < 0.10"
- "mlBkgScore < 0.20"
- "mlBkgScore < 0.25"
- "mlBkgScore < 0.30"

systematics: # used in machine_learning_hep/analysis/systematics.py
probvariation:
Expand Down
22 changes: 16 additions & 6 deletions machine_learning_hep/processer_jet.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,17 @@ def _calculate_variables(self, df, verify=False): # pylint: disable=invalid-name
df['hfPx'] = df.fPt * np.cos(df.fPhi)
df['hfPy'] = df.fPt * np.sin(df.fPhi)
df['hfPz'] = df.fPt * np.sinh(df.fEta)
df['piPx'] = df.jetPx - df.hfPx
df['piPy'] = df.jetPy - df.hfPy
df['piPz'] = df.jetPz - df.hfPz
df['zpar_num'] = df.jetPx * df.hfPx + df.jetPy * df.hfPy + df.jetPz * df.hfPz
df['zpar_den'] = df.jetPx * df.jetPx + df.jetPy * df.jetPy + df.jetPz * df.jetPz
df['zpar'] = df.zpar_num / df.zpar_den
df[df['zpar'] >= 1.]['zpar'] = .999 # move 1 to last bin
df['nsub21'] = df.fNSub2 / df.fNSub1
df['E_D'] = np.sqrt(1.86**2 + df.hfPx**2 + df.hfPy**2 + df.hfPz**2)
df['E_pi'] = np.sqrt(.139**2 + df.piPx**2 + df.piPy**2 + df.piPz**2)
df['M_D_pi'] = np.sqrt((df.E_D + df.E_pi)**2 - df.jetPx**2 - df.jetPy**2 - df.jetPz**2)

self.logger.debug('zg')
df['zg_array'] = np.array(.5 - abs(df.fPtSubLeading / (df.fPtLeading + df.fPtSubLeading) - .5))
Expand Down Expand Up @@ -232,6 +238,8 @@ def process_histomass_single(self, index):
dfquery(df, 'idx_match >= 0', inplace=True)

self._calculate_variables(df)
# FIXME: suppress D*, move to DB
df = df[(abs(df.M_D_pi - 2.01) > .01) | (df.fJetNConstituents > 2)]

for obs, spec in self.cfg('observables', {}).items():
self.logger.debug('preparing histograms for %s', obs)
Expand Down Expand Up @@ -281,7 +289,8 @@ def process_efficiency_single(self, index):
h_response_fd = {var:
create_hist(
f'h_response_fd_{var}',
f";p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}}^{{jet}} (GeV/#it{{c}});{var};p_{{T}} (GeV/#it{{c}})",
f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var};" +
f"p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}",
self.binarrays_ptjet['det'][var], self.binarrays_obs['det']['fPt'], self.binarrays_obs['det'][var],
self.binarrays_ptjet['gen'][var], self.binarrays_obs['gen']['fPt'], self.binarrays_obs['gen'][var])
for var in self.cfg('observables', []) if not '-' in var}
Expand Down Expand Up @@ -380,7 +389,8 @@ def process_efficiency_single(self, index):

if cat in dfmatch and dfmatch[cat] is not None:
self._prepare_response(dfmatch[cat], h_effkine, h_response, cat, var)
self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, var)
if cat == 'np':
self._prepare_response_fd(dfmatch[cat], h_effkine_fd, h_response_fd, var)
f = self.cfg('frac_mcana', .2)
_, df_mccorr = self.split_df(dfmatch[cat], f if f < 1. else 0.)
self._prepare_response(df_mccorr, h_effkine_frac, h_response_frac, cat, var)
Expand Down Expand Up @@ -428,6 +438,8 @@ def _prepare_response_fd(self, dfi, h_effkine, h_response, var):
axis_var_gen = get_axis(h_response[var], 5)

df = dfi
fill_hist(h_response[var], df[['fJetPt', 'fPt', f'{var}', 'fJetPt_gen', 'fPt_gen', f'{var}_gen']])

# TODO: the first cut should be taken care of by under-/overflow bins, check their usage in analyzer
df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) &
(df.fPt >= axis_pthf_det.GetXmin()) & (df.fPt < axis_pthf_det.GetXmax()) &
Expand All @@ -438,14 +450,12 @@ def _prepare_response_fd(self, dfi, h_effkine, h_response, var):
(df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())]
fill_hist(h_effkine[('det', 'cut', var)], df[['fJetPt', 'fPt', var]])

fill_hist(h_response[var], df[['fJetPt', 'fPt', f'{var}', 'fJetPt_gen', 'fPt_gen', f'{var}_gen']])

df = dfi
df = df.loc[(df.fJetPt_gen >= axis_ptjet_gen.GetXmin()) & (df.fJetPt_gen < axis_ptjet_gen.GetXmax()) &
(df.fPt_gen >= axis_pthf_gen.GetXmin()) & (df.fPt_gen < axis_pthf_gen.GetXmax()) &
(df[f'{var}_gen'] >= axis_var_gen.GetXmin()) & (df[f'{var}_gen'] < axis_var_gen.GetXmax())]
fill_hist(h_effkine[('gen', 'nocuts', var)], df[['fJetPt_gen', 'fPt', f'{var}_gen']])
fill_hist(h_effkine[('gen', 'nocuts', var)], df[['fJetPt_gen', 'fPt_gen', f'{var}_gen']])
df = df.loc[(df.fJetPt >= axis_ptjet_det.GetXmin()) & (df.fJetPt < axis_ptjet_det.GetXmax()) &
(df.fPt >= axis_pthf_det.GetXmin()) & (df.fPt < axis_pthf_det.GetXmax()) &
(df[f'{var}'] >= axis_var_det.GetXmin()) & (df[f'{var}'] < axis_var_det.GetXmax())]
fill_hist(h_effkine[('gen', 'cut', var)], df[['fJetPt_gen', 'fPt', f'{var}_gen']])
fill_hist(h_effkine[('gen', 'cut', var)], df[['fJetPt_gen', 'fPt_gen', f'{var}_gen']])
32 changes: 25 additions & 7 deletions machine_learning_hep/utils/hist.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,13 +293,19 @@ def norm_response(response, dim_out):
for bin_in in itertools.product(*(range(1, get_nbins(response_norm, iaxis) + 1)
for iaxis in range(dim_out, get_dim(response_norm)))):
for iaxis, val in enumerate(bin_in, dim_out):
get_axis(response_norm, iaxis).SetRange(val, val)
norm = response_norm.Projection(0).Integral()
get_axis(response, iaxis).SetRange(val, val)
# norm = response.Projection(0).Integral()
norm = 0.
for bin_out in itertools.product(*(range(1, get_nbins(response, i) + 1) for i in range(dim_out))):
norm += get_bin_val(response, bin_out + bin_in)
if np.isclose(norm, 0.):
continue
for bin_out in itertools.product(*(range(1, get_nbins(response_norm, i)+1) for i in range(dim_out))):
set_bin_val(response_norm, bin_out + bin_in, get_bin_val(response_norm, bin_out + bin_in) / norm)
set_bin_err(response_norm, bin_out + bin_in, get_bin_err(response_norm, bin_out + bin_in) / norm)
total = 0.
for bin_out in itertools.product(*(range(1, get_nbins(response_norm, i) + 1) for i in range(dim_out))):
set_bin_val(response_norm, bin_out + bin_in, get_bin_val(response, bin_out + bin_in) / norm)
set_bin_err(response_norm, bin_out + bin_in, get_bin_err(response, bin_out + bin_in) / norm)
total += get_bin_val(response_norm, bin_out + bin_in)
# print(f'distributed {bin_in=} to {total=} counts')
return response_norm


Expand All @@ -309,10 +315,22 @@ def fold_hist(hist, response):
dim_out = get_dim(response) - get_dim(hist)
axes_spec = list(np.array(get_axis(response, i).GetXbins(), 'd') for i in range(dim_out))
hfold = create_hist('test', 'test', *axes_spec)
for bin_out in itertools.product(*(range(1, get_nbins(hfold, i)+1) for i in range(get_dim(hfold)))):
# TODO: setup axes
for bin_in in itertools.product(*(range(1, get_nbins(hist, i) + 1) for i in range(get_dim(hist)))):
total = 0.
for bin_out in itertools.product(*(range(1, get_nbins(hfold, i) + 1) for i in range(get_dim(hfold)))):
total += get_bin_val(response, bin_out + bin_in)
# print(f'redistributed {bin_in=} to {total=} counts')

for bin_out in itertools.product(*(range(1, get_nbins(hfold, i) + 1) for i in range(get_dim(hfold)))):
val = 0.
err = 0.
for bin_in in itertools.product(*(range(1, get_nbins(hist, i)+1) for i in range(get_dim(hist)))):
for bin_in in itertools.product(*(range(1, get_nbins(hist, i) + 1) for i in range(get_dim(hist)))):
# if bin_in == bin_out:
# val = get_bin_val(hist, bin_in)
# err = get_bin_err(hist, bin_in)**2
# continue
# print(f'{bin_out=} collecting {bin_in=} with weight {get_bin_val(response, bin_out + bin_in)}')
val += get_bin_val(hist, bin_in) * get_bin_val(response, bin_out + bin_in)
err += get_bin_err(hist, bin_in)**2 * get_bin_val(response, bin_out + bin_in)**2
set_bin_val(hfold, bin_out, val)
Expand Down
Loading