-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvert_parquet_to_root.py
90 lines (75 loc) · 3.61 KB
/
convert_parquet_to_root.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import numpy
import awkward as ak
import uproot
import sys
import getpass
import argparse
parser = argparse.ArgumentParser()
usr=getpass.getuser()
parser.add_argument("--input", help = "input parquet file", type=str, required=True)
parser.add_argument("--out_dir", help = "output directory", type=str, default = f"/ceph/cms/store/user/{usr}/SRopt/")
parser.add_argument("--out_name", help = "output filename, without extension", type=str, default = None)
parser.add_argument("--slim", help = "output file contains minimal branches", action="store_true", default = True)
parser.add_argument("--verbose", help = "output file contains minimal branches", action="store_true", default = False)
args = parser.parse_args()
"""
Script to convert parquet files into TTrees so that it can be used with the SRs optimization script.
"""
def to_tensor(dataframe, columns = [], dtypes = {}):
# Use all columns from data frame if none where listed when called
if len(columns) <= 0:
columns = dataframe.fields
# Build list of dtypes to use, updating from any `dtypes` passed when called
dtype_list = []
for column in columns:
if column not in dtypes.keys():
if column == "proc" or column == "sample"or column =="run_era":
dtype_list.append("S15")
else: dtype_list.append(type(dataframe[column][0]))
else:
dtype_list.append(dtypes[column])
# Build dictionary with lists of column names and formatting in the same order
dtype_dict = {
'names': columns,
'formats': dtype_list
}
# Initialize _mostly_ empty numpy array with column names and formatting
numpy_buffer = numpy.zeros(
shape = len(dataframe),
dtype = dtype_dict)
# Insert values from dataframe columns into numpy labels
for col in columns:
if args.verbose:
print(f"column {col} input shape {numpy_buffer[col].shape} output shape {dataframe[col].to_numpy().shape}")
numpy_buffer[col] = dataframe[col].to_numpy()
# Return results of conversion
return numpy_buffer
events = ak.from_parquet(args.input)
# For the time being, we have to match processes into integers. Will need to modify the SR opt FIXME
proc_dict={
"Data_EraE": 0, "Data_EraF": 0, "Data_EraG": 0, "Data": 0,
"GluGluHToGG": 1, "ttHToGG": 2, "VBFHToGG": 3, "VHToGG": 4,
"GGJets": 5, "DDQCDGJets": 6,"DDBKG": 6,
"GluGluToHH": 8
}
if "proc" in events.fields: #SnT style parquet
events = ak.with_field(events, [ proc_dict.get(proc) for proc in events["proc"].to_list()] , "process_id")
needed_fields=["mass","weight","score_GluGluToHH","process_id"]
elif "sample" in events.fields: #NW style parquet
events = ak.with_field(events, [ proc_dict.get(proc) for proc in events["sample"].to_list()] , "process_id")
needed_fields=["pt","mass","weight_tot","signleH_dnn_new","ddbkg_dnn","process_id"]
else:
sys.exit("[Parquet2root] Parquet format not supported (no proc or sample field found)")
print ("[Parquet2root] Done with the id addition")
if args.slim: # Slimming by defaut. It's good for you + NW parquet conversion not fully working for literal fields FIXME
events = events[[f for f in events.fields if f in needed_fields]]
print("[Parquet2root] Done with the slimming")
#square down parquet dataframes before exporting to root
events = to_tensor(events)
if args.out_name is None:
output_root = args.out_dir + args.input.split("/")[-1].replace(".parquet", ".root")
else:
output_root = args.out_dir + args.out_name + ".root"
with uproot.recreate(output_root) as f:
f["t"] = events
print("[Parquet2root] All good")