-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathforest_predict.py
129 lines (107 loc) · 5.59 KB
/
forest_predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python3
# author : Pavel Polishchuk
# license : BSD-3
#==============================================================================
__author__ = 'Pavel Polishchuk'
import pandas as pd
import pickle
import argparse
import os
import gzip
from multiprocessing import Pool, cpu_count
from forest import predict_tree
def predict_tree_mp(items):
    """Run ``predict_tree`` on one packed work item.

    The pool's ``imap`` hands each worker a single object, so the
    positional arguments for ``predict_tree`` arrive bundled together in
    *items* and are spread back out here.

    :param items: iterable holding the positional arguments for
        ``predict_tree`` (the script passes ``(tree, x)`` pairs).
    :return: whatever ``predict_tree`` returns for those arguments.
    """
    call_args = tuple(items)
    result = predict_tree(*call_args)
    return result
def supply_data(model, x):
    """Lazily pair every tree of *model* with the same data *x*.

    :param model: iterable of trees.
    :param x: object to attach to each tree (the script passes a chunk of
        the descriptor table).
    :return: generator of ``(tree, x)`` tuples, one per tree, in the
        iteration order of *model*.
    """
    yield from ((member, x) for member in model)
def load_model(model_fname):
    """Unpickle a forest model from *model_fname*.

    A file name ending with ``.gz`` is read through gzip, anything else as
    a plain binary file. The file handle is closed as soon as the model has
    been loaded.

    NOTE(review): unpickling can execute arbitrary code; only load model
    files that come from a trusted source.

    :param model_fname: path to the pickled (optionally gzipped) model.
    :return: the unpickled model object.
    """
    opener = gzip.open if model_fname.endswith('.gz') else open
    with opener(model_fname, 'rb') as handle:
        return pickle.load(handle)


def summarize_predictions(pred, n_trees, cumulative=False, sd=False):
    """Collapse per-tree predictions into the table written to disk.

    :param pred: DataFrame with one row per compound and one column per
        tree. Missing values mark trees that must not contribute to a row
        (the OOB path masks in-bag trees this way).
    :param n_trees: number of trees in the model; used as the name of the
        mean column in the non-cumulative output.
    :param cumulative: if true, column ``k`` holds the mean over the first
        ``k`` trees instead of a single overall mean.
    :param sd: if true (and not cumulative), add an ``sd`` column with the
        standard deviation among the individual tree predictions.
    :return: DataFrame rounded to three decimals.
    """
    if cumulative:
        # Divide the running sum by the number of trees that really
        # contributed so far. Dividing by the plain tree count 1..n
        # underestimates the mean whenever some trees are masked out.
        contributing = pred.notna().cumsum(axis=1)
        running_mean = pred.cumsum(axis=1) / contributing
        running_mean.columns = list(range(1, running_mean.shape[1] + 1))
        # A masked tree leaves a gap; carry the last known mean forward.
        # Without missing values this is a no-op.
        return running_mean.round(3).ffill(axis=1)

    mean = pred.mean(axis=1).round(3).to_frame(name=n_trees)
    if not sd:
        return mean
    spread = pred.std(axis=1).round(3).to_frame(name='sd')
    return pd.concat([mean, spread], axis=1)


def write_chunk(result, fname):
    """Write *result* to the tab-separated file *fname*.

    The first chunk creates the file with a header; later chunks are
    appended without one.
    """
    if os.path.isfile(fname):
        result.to_csv(fname, sep='\t', mode='a', header=False)
    else:
        result.to_csv(fname, sep='\t')


def main():
    """Parse the command line and write predictions chunk by chunk.

    :raises ValueError: if neither a prediction nor an OOB output file was
        requested.
    """
    parser = argparse.ArgumentParser(description='Make prediction with Random Forest model.')
    parser.add_argument('-x', metavar='descriptors.txt', required=True,
                        help='text file with descriptors (tab-separated). '
                             'Header is present. The first column contains compound names.')
    parser.add_argument('-m', '--model', metavar='model.pkl', required=True,
                        help='file with a pickled model.')
    parser.add_argument('-p', '--prediction', metavar='predictions.txt', required=False, default=None,
                        help='text file with predicted values. Default: None.')
    parser.add_argument('-o', '--oob', metavar='oob_predictions.txt', required=False, default=None,
                        help='text file with predicted values for the out-of-bag set. '
                             'X values for the training set of the model '
                             'should be supplied to get correct results. For calculation of OOB statistics a model '
                             'containing full information should be supplied (not cleaned with clean_forest.py '
                             'script). Default: None.')
    parser.add_argument('-u', '--cumulative', action='store_true', default=False,
                        help='to make cumulative predictions: the first column is predictions for the first tree, '
                             'the second one - the first two tress, and so on. This option is only needed if one wants '
                             'to track changes in accuracy predictions with increasing number of trees in the model.')
    parser.add_argument('-s', '--sd', action='store_true', default=False,
                        help='set this argument to calculate standard deviation among predictions of individual trees '
                             'which can be used as a measure of applicability domain. If -u argument was set the '
                             'sd argument will be ignored.')
    parser.add_argument('-c', '--ncpu', metavar='NUMBER', required=False, default=1, type=int,
                        help='number of CPU to use. Default: 1.')
    args = parser.parse_args()

    x_fname = args.x
    model_fname = args.model
    pred_fname = args.prediction
    oob_fname = args.oob
    cumulative = args.cumulative
    ncpu = args.ncpu
    # Cumulative output has no room for an sd column, so -u overrides -s.
    sd = args.sd and not cumulative

    if pred_fname is None and oob_fname is None:
        raise ValueError('at least one of outputs should be specified: prediction for the whole set or for oob.')

    # Start from clean output files: chunks are appended below and must not
    # end up behind results of a previous run.
    for fname in (pred_fname, oob_fname):
        if fname is not None and os.path.isfile(fname):
            os.remove(fname)

    pool = Pool(min(ncpu, cpu_count())) if ncpu > 1 else None
    try:
        model = load_model(model_fname)
        n_trees = len(model)

        for x in pd.read_table(x_fname, sep="\t", index_col=0, chunksize=100000):
            if pool is not None:
                pred = list(pool.imap(predict_tree_mp, supply_data(model, x)))
            else:
                pred = [predict_tree(tree, x) for tree in model]

            # One column per tree, labelled by the tree's position in the model.
            pred = pd.concat(pred, axis=1)
            pred.columns = list(range(pred.shape[1]))

            if pred_fname:
                write_chunk(summarize_predictions(pred, n_trees, cumulative, sd), pred_fname)

            if oob_fname:
                # Blank out each tree's prediction for the rows listed in its
                # 'mol_names' entry so that only the remaining trees are
                # averaged. NOTE(review): presumably these are the compounds
                # the tree was trained on - confirm against the forest module.
                for i, tree in enumerate(model):
                    pred.loc[pred.index.isin(tree.node[-1]['mol_names']), i] = None
                write_chunk(summarize_predictions(pred, n_trees, cumulative, sd), oob_fname)
    finally:
        # Release the worker processes even if prediction fails midway.
        if pool is not None:
            pool.close()
            pool.join()


if __name__ == '__main__':
    main()