This repository was archived by the owner on Jun 22, 2022. It is now read-only.

Commit cd74e65

Kamil A. Kaczmarek authored and jakubczakon committed

Dev corrections (#37)

* single neptune context
* bug fix -> row as type float
* imports optimized
* corrected metric name
* step names' corrections
* added info that Kaggle submit was successful
1 parent f034606 commit cd74e65

7 files changed: +91 −70 lines
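The heart of the change, sketched below: every module previously built its own neptune.Context and re-read the YAML parameters, while after this commit they all share a single NeptuneContext (a Borg singleton defined in src/utils.py).

    # before, repeated per module:
    ctx = neptune.Context()
    params = read_params(ctx, fallback_file='neptune.yaml')

    # after, one shared context:
    neptune_ctx = NeptuneContext()
    params = neptune_ctx.params
    ctx = neptune_ctx.ctx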

Diff for: src/feature_extraction.py

+1 −1

@@ -109,7 +109,7 @@ def _add_prefix(self, columns, bucket_id):
 
 
 def aggregate_row(row):
-    non_zero_values = row.iloc[row.nonzero()]
+    non_zero_values = row.iloc[row.nonzero()].astype(np.float)
     if non_zero_values.empty:
         aggregations = {'non_zero_mean': np.nan,
                         'non_zero_std': np.nan,
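A quick sketch of what the added cast buys, on made-up data (note: np.float was only ever an alias for the built-in float and was removed in NumPy 1.24, and Series.nonzero() was deprecated in later pandas, so a modern equivalent is used here):

    import numpy as np
    import pandas as pd

    row = pd.Series([0, 3, 0, 7])                  # a mostly-zero feature row
    idx = row.to_numpy().nonzero()[0]              # stand-in for row.nonzero()
    non_zero_values = row.iloc[idx].astype(float)  # the cast keeps aggregates in float
    print(non_zero_values.mean(), non_zero_values.std())  # 5.0 2.8284...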

Diff for: src/hyperparameter_tuning.py

+6 −7

@@ -1,12 +1,11 @@
 import gc
 
 import numpy as np
-from deepsense import neptune
 from sklearn.externals import joblib
 from steppy.base import BaseTransformer
 from steppy.utils import get_logger
 
-from .utils import set_seed
+from .utils import set_seed, NeptuneContext
 
 logger = get_logger()
 
@@ -132,14 +131,14 @@ def on_search_end(self, results):
 class NeptuneMonitor(GridSearchCallback):
     def __init__(self, name):
         self.name = name
-        self.ctx = neptune.Context()
+        self.neptune_ctx = NeptuneContext()
         self.highest_params_channel = self._create_text_channel(name='highest params')
         self.lowest_params_channel = self._create_text_channel(name='lowest params')
         self.run_params_channel = self._create_text_channel(name='run params')
         self.run_id = 0
 
     def on_run_end(self, score, params):
-        self.ctx.channel_send('score on run', x=self.run_id, y=score)
+        self.neptune_ctx.ctx.channel_send('score on run', x=self.run_id, y=score)
         self.run_params_channel.send(y=params)
         self.run_id += 1
 
@@ -148,14 +147,14 @@ def on_search_end(self, results):
         highest_score, highest_param_set = results_sorted[-1]
         lowest_score, lowest_param_set = results_sorted[0]
 
-        self.ctx.channel_send('highest score', x=0, y=highest_score)
-        self.ctx.channel_send('lowest score', x=0, y=lowest_score)
+        self.neptune_ctx.ctx.channel_send('highest score', x=0, y=highest_score)
+        self.neptune_ctx.ctx.channel_send('lowest score', x=0, y=lowest_score)
 
         self.highest_params_channel.send(y=highest_param_set)
         self.lowest_params_channel.send(y=lowest_param_set)
 
     def _create_text_channel(self, name=''):
-        return self.ctx.create_channel(name=name, channel_type=neptune.ChannelType.TEXT)
+        return self.neptune_ctx.ctx.create_channel(name=name, channel_type=self.neptune_ctx.text_channel)
 
 
 class PersistResults(GridSearchCallback):
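Note that the channel_type change is equivalence-preserving: NeptuneContext merely stores the enum constants (see the src/utils.py diff below).

    # NeptuneContext.__init__ sets:
    #     self.text_channel = neptune.ChannelType.TEXT
    # hence channel_type=self.neptune_ctx.text_channel passes the same value
    # that channel_type=neptune.ChannelType.TEXT passed before.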

Diff for: src/models.py

+3 −5

@@ -2,14 +2,13 @@
 import numpy as np
 import pandas as pd
 from attrdict import AttrDict
-from deepsense import neptune
 from sklearn.externals import joblib
 from steppy.base import BaseTransformer
 
-from .utils import get_logger
+from .utils import NeptuneContext, get_logger
 
+neptune_ctx = NeptuneContext()
 logger = get_logger()
-ctx = neptune.Context()
 
 
 class LightGBM(BaseTransformer):
@@ -130,6 +129,5 @@ def callback(env):
                 channel_name = '{}_{}_{}'.format(channel_prefix, name, loss_name)
             else:
                 channel_name = '{}_{}'.format(name, loss_name)
-            ctx.channel_send(channel_name, x=env.iteration, y=loss_value)
-
+            neptune_ctx.ctx.channel_send(channel_name, x=env.iteration, y=loss_value)
     return callback
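For readers unfamiliar with the pattern above: LightGBM invokes each registered callback with an env namedtuple carrying, among other fields, iteration and evaluation_result_list, whose entries look like (dataset_name, eval_name, result, is_higher_better). A self-contained sketch of the factory (the name neptune_monitor, and the print standing in for channel_send, are assumptions):

    def neptune_monitor(channel_prefix=''):
        def callback(env):
            # each entry is (dataset_name, eval_name, result, is_higher_better)
            for name, loss_name, loss_value, _ in env.evaluation_result_list:
                if channel_prefix:
                    channel_name = '{}_{}_{}'.format(channel_prefix, name, loss_name)
                else:
                    channel_name = '{}_{}'.format(name, loss_name)
                # the real code sends this point to Neptune:
                # neptune_ctx.ctx.channel_send(channel_name, x=env.iteration, y=loss_value)
                print(env.iteration, channel_name, loss_value)
        return callback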

Diff for: src/pipeline_blocks.py

+7 −5

@@ -283,18 +283,20 @@ def _join_features(numerical_features,
 def _get_feature_projectors(config):
     feature_projectors = []
     if config.truncated_svd.use:
-        feature_projectors.append((TruncatedSVD, config.truncated_svd.params, 'trunc_svd'))
+        feature_projectors.append((TruncatedSVD, config.truncated_svd.params, 'truncated svd'))
     if config.pca.use:
         feature_projectors.append((fe.PCA, config.pca.params, 'pca'))
     if config.fast_ica.use:
-        feature_projectors.append((fe.FastICA, config.fast_ica.params, 'fast_ica'))
+        feature_projectors.append((fe.FastICA, config.fast_ica.params, 'fast ica'))
     if config.factor_analysis.use:
-        feature_projectors.append((fe.FactorAnalysis, config.factor_analysis.params, 'factor_analysis'))
+        feature_projectors.append((fe.FactorAnalysis, config.factor_analysis.params, 'factor analysis'))
     if config.gaussian_random_projection.use:
         feature_projectors.append(
-            (fe.GaussianRandomProjection, config.gaussian_random_projection.params, 'grp'))
+            (fe.GaussianRandomProjection, config.gaussian_random_projection.params, 'gaussian random projection'))
     if config.sparse_random_projection.use:
-        feature_projectors.append((fe.SparseRandomProjection, config.sparse_random_projection.params, 'srp'))
+        feature_projectors.append((fe.SparseRandomProjection,
+                                   config.sparse_random_projection.params,
+                                   'sparse random projection'))
     return feature_projectors
 
 
Diff for: src/pipeline_config.py

+3 −4

@@ -1,12 +1,11 @@
 import os
 
 from attrdict import AttrDict
-from deepsense import neptune
 
-from .utils import read_params, parameter_eval
+from .utils import NeptuneContext, parameter_eval
 
-ctx = neptune.Context()
-params = read_params(ctx, fallback_file='neptune.yaml')
+neptune_ctx = NeptuneContext()
+params = neptune_ctx.params
 
 RANDOM_SEED = 90210
 DEV_SAMPLE_SIZE = 500

Diff for: src/pipeline_manager.py

+29 −25

@@ -3,22 +3,24 @@
 
 import numpy as np
 import pandas as pd
-from deepsense import neptune
 from scipy.stats import gmean
 from sklearn.model_selection import train_test_split
 
 from . import pipeline_config as cfg
 from .pipelines import PIPELINES
-from .utils import init_logger, read_params, set_seed, create_submission, verify_submission, \
-    log_root_mean_squared_error, KFoldByTargetValue
+from .utils import init_logger, NeptuneContext, set_seed, \
+    create_submission, verify_submission, \
+    root_mean_squared_log_error, KFoldByTargetValue
+
+neptune_ctx = NeptuneContext()
+params = neptune_ctx.params
+ctx = neptune_ctx.ctx
 
 set_seed(cfg.RANDOM_SEED)
 logger = init_logger()
-ctx = neptune.Context()
-params = read_params(ctx, fallback_file='neptune.yaml')
 
 
-class PipelineManager():
+class PipelineManager:
     def train(self, pipeline_name, dev_mode):
         train(pipeline_name, dev_mode)
 

@@ -105,10 +107,10 @@ def evaluate(pipeline_name, dev_mode):
 
     y_pred = output['prediction']
 
-    logger.info('Calculating LRMSE on validation set')
-    score = log_root_mean_squared_error(y_true, y_pred)
-    logger.info('LRMSE score on validation is {}'.format(score))
-    ctx.channel_send('LRMSE', 0, score)
+    logger.info('Calculating RMSLE on validation set')
+    score = root_mean_squared_log_error(y_true, y_pred)
+    logger.info('RMSLE score on validation is {}'.format(score))
+    ctx.channel_send('RMSLE', 0, score)
 
 
 def predict(pipeline_name, dev_mode, submit_predictions):
@@ -149,9 +151,11 @@ def predict(pipeline_name, dev_mode, submit_predictions):
 
 
 def make_submission(submission_filepath):
-    logger.info('making Kaggle submit...')
-    os.system('kaggle competitions submit -c santander-value-prediction-challenge -f {} -m {}'
-              .format(submission_filepath, params.kaggle_message))
+    logger.info('Making Kaggle submit...')
+    os.system('kaggle competitions submit -c santander-value-prediction-challenge -f {} -m {}'.format(
+        submission_filepath,
+        params.kaggle_message))
+    logger.info('Kaggle submit completed')
 
 
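One observation (not part of the commit): the -m {} placeholder is interpolated unquoted, so a kaggle_message containing spaces would be split by the shell into stray arguments; quoting it as -m "{}" would be the defensive form. The Kaggle CLI call itself is the standard one:

    # after formatting, the shell sees something like (hypothetical values):
    #   kaggle competitions submit -c santander-value-prediction-challenge -f submission.csv -m dev_run_1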
def train_evaluate_cv(pipeline_name, dev_mode):
@@ -161,7 +165,7 @@ def train_evaluate_cv(pipeline_name, dev_mode):
 
     logger.info('Reading data...')
     if dev_mode:
-        logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE))
+        logger.info('Running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE))
         train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
     else:
         train = pd.read_csv(params.train_filepath)
@@ -181,16 +185,16 @@ def train_evaluate_cv(pipeline_name, dev_mode):
 
         score, _, _ = _fold_fit_evaluate_loop(train_data_split, valid_data_split, fold_id, pipeline_name)
 
-        logger.info('Fold {} LRMSE {}'.format(fold_id, score))
-        ctx.channel_send('Fold {} LRMSE'.format(fold_id), 0, score)
+        logger.info('Fold {} RMSLE {}'.format(fold_id, score))
+        ctx.channel_send('Fold {} RMSLE'.format(fold_id), 0, score)
 
         fold_scores.append(score)
 
     score_mean, score_std = np.mean(fold_scores), np.std(fold_scores)
 
-    logger.info('LRMSE mean {}, LRMSE std {}'.format(score_mean, score_std))
-    ctx.channel_send('LRMSE', 0, score_mean)
-    ctx.channel_send('LRMSE STD', 0, score_std)
+    logger.info('RMSLE mean {}, RMSLE std {}'.format(score_mean, score_std))
+    ctx.channel_send('RMSLE', 0, score_mean)
+    ctx.channel_send('RMSLE STD', 0, score_std)
 
 
 def train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions):
@@ -224,8 +228,8 @@ def train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions):
                                                                          valid_data_split, test,
                                                                          fold_id, pipeline_name)
 
-        logger.info('Fold {} LRMSE {}'.format(fold_id, score))
-        ctx.channel_send('Fold {} LRMSE'.format(fold_id), 0, score)
+        logger.info('Fold {} RMSLE {}'.format(fold_id, score))
+        ctx.channel_send('Fold {} RMSLE'.format(fold_id), 0, score)
 
         out_of_fold_train_predictions.append(out_of_fold_prediction)
         out_of_fold_test_predictions.append(test_prediction)
@@ -237,9 +241,9 @@ def train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions):
     test_prediction_aggregated = _aggregate_test_prediction(out_of_fold_test_predictions)
     score_mean, score_std = np.mean(fold_scores), np.std(fold_scores)
 
-    logger.info('LRMSE mean {}, LRMSE std {}'.format(score_mean, score_std))
-    ctx.channel_send('LRMSE', 0, score_mean)
-    ctx.channel_send('LRMSE STD', 0, score_std)
+    logger.info('RMSLE mean {}, RMSLE std {}'.format(score_mean, score_std))
+    ctx.channel_send('RMSLE', 0, score_mean)
+    ctx.channel_send('RMSLE STD', 0, score_std)
 
     logger.info('Saving predictions')
     out_of_fold_train_predictions.to_csv(os.path.join(params.experiment_directory,
@@ -317,7 +321,7 @@ def _fold_fit_evaluate_loop(train_data_split, valid_data_split, fold_id, pipeline_name):
 
     y_valid_pred = output_valid['prediction']
     y_valid_true = valid_data_split[cfg.TARGET_COLUMN].values
-    score = log_root_mean_squared_error(y_valid_true, y_valid_pred)
+    score = root_mean_squared_log_error(y_valid_true, y_valid_pred)
 
     return score, y_valid_pred, pipeline

Diff for: src/utils.py

+42 −23

@@ -7,11 +7,52 @@
 import pandas as pd
 import yaml
 from attrdict import AttrDict
+from deepsense import neptune
 from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import BaseCrossValidator
 from steppy.base import BaseTransformer
 
 
+# Alex Martelli's 'Borg'
+# http://python-3-patterns-idioms-test.readthedocs.io/en/latest/Singleton.html
+class _Borg:
+    _shared_state = {}
+
+    def __init__(self):
+        self.__dict__ = self._shared_state
+
+
+class NeptuneContext(_Borg):
+    def __init__(self, fallback_file='neptune_local.yaml'):
+        _Borg.__init__(self)
+
+        self.ctx = neptune.Context()
+        self.fallback_file = fallback_file
+        self.params = self._read_params()
+        self.numeric_channel = neptune.ChannelType.NUMERIC
+        self.image_channel = neptune.ChannelType.IMAGE
+        self.text_channel = neptune.ChannelType.TEXT
+
+    def _read_params(self):
+        if self.ctx.params.__class__.__name__ == 'OfflineContextParams':
+            params = self._read_yaml().parameters
+        else:
+            params = self.ctx.params
+        return params
+
+    def _read_yaml(self):
+        with open(self.fallback_file) as f:
+            config = yaml.load(f)
+        return AttrDict(config)
+
+
+def parameter_eval(param):
+    try:
+        return eval(param)
+    except Exception:
+        return param
+
+
 def create_submission(meta, predictions):
     submission = pd.DataFrame({'ID': meta['ID'].tolist(),
                                'target': predictions
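To see why the Borg pattern yields one shared context, here is a minimal, self-contained sketch (Config is a hypothetical stand-in for NeptuneContext, whose real __init__ needs a live neptune client):

    class _Borg:
        _shared_state = {}

        def __init__(self):
            self.__dict__ = self._shared_state


    class Config(_Borg):  # stand-in for NeptuneContext
        def __init__(self):
            _Borg.__init__(self)


    a = Config()
    a.params = {'learning_rate': 0.1}
    b = Config()     # a distinct instance...
    print(b.params)  # ...but the same state: {'learning_rate': 0.1}

Relatedly, the relocated parameter_eval simply tries to evaluate stringly-typed params into Python values: parameter_eval('[0.1, 0.2]') returns the list [0.1, 0.2], while parameter_eval('lgbm') raises inside eval and falls back to the plain string.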
@@ -50,28 +91,6 @@ def init_logger():
     return logger
 
 
-def read_params(ctx, fallback_file):
-    if ctx.params.__class__.__name__ == 'OfflineContextParams':
-        neptune_config = read_yaml(fallback_file)
-        params = neptune_config.parameters
-    else:
-        params = ctx.params
-    return params
-
-
-def read_yaml(filepath):
-    with open(filepath) as f:
-        config = yaml.load(f)
-    return AttrDict(config)
-
-
-def parameter_eval(param):
-    try:
-        return eval(param)
-    except Exception:
-        return param
-
-
 def persist_evaluation_predictions(experiment_directory, y_pred, raw_data, id_column, target_column):
     raw_data.loc[:, 'y_pred'] = y_pred.reshape(-1)
     predictions_df = raw_data.loc[:, [id_column, target_column, 'y_pred']]
@@ -103,7 +122,7 @@ def root_mean_squared_error(y_true, y_pred):
     return np.sqrt(mean_squared_error(y_true, y_pred))
 
 
-def log_root_mean_squared_error(y_true, y_pred):
+def root_mean_squared_log_error(y_true, y_pred):
     return np.sqrt(mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred)))
 
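The rename fixes the terminology: the function computes root mean squared logarithmic error (RMSLE), the Santander competition's metric, not the "log of an RMSE" that the old name suggested. A quick sanity check of the formula as written, on made-up values:

    import numpy as np
    from sklearn.metrics import mean_squared_error

    y_true = np.array([100.0, 1000.0])
    y_pred = np.array([110.0, 900.0])
    rmsle = np.sqrt(mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred)))
    print(rmsle)  # ~0.0999: roughly 10% relative error, independent of scale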
