This repository was archived by the owner on Jun 22, 2022. It is now read-only.

Commit cd74e65

Kamil A. Kaczmarek authored and jakubczakon committed

Dev corrections (#37)

* single neptune context
* bug fix -> row as type float
* imports optimized
* corrected metric name
* step names' corrections
* added info that Kaggle submit was successful
1 parent f034606 commit cd74e65

7 files changed: +91 −70 lines
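The heart of the change, sketched below: every module previously built its own neptune.Context and re-read the YAML parameters, while after this commit they all share a single NeptuneContext (a Borg singleton defined in src/utils.py).

    # before, repeated per module:
    ctx = neptune.Context()
    params = read_params(ctx, fallback_file='neptune.yaml')

    # after, one shared context:
    neptune_ctx = NeptuneContext()
    params = neptune_ctx.params
    ctx = neptune_ctx.ctx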

Diff for: src/feature_extraction.py

+1 −1

@@ -109,7 +109,7 @@ def _add_prefix(self, columns, bucket_id):
 
 
 def aggregate_row(row):
-    non_zero_values = row.iloc[row.nonzero()]
+    non_zero_values = row.iloc[row.nonzero()].astype(np.float)
     if non_zero_values.empty:
         aggregations = {'non_zero_mean': np.nan,
                         'non_zero_std': np.nan,
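A quick sketch of what the added cast buys, on made-up data (note: np.float was only ever an alias for the built-in float and was removed in NumPy 1.24, and Series.nonzero() was deprecated in later pandas, so a modern equivalent is used here):

    import numpy as np
    import pandas as pd

    row = pd.Series([0, 3, 0, 7])                  # a mostly-zero feature row
    idx = row.to_numpy().nonzero()[0]              # stand-in for row.nonzero()
    non_zero_values = row.iloc[idx].astype(float)  # the cast keeps aggregates in float
    print(non_zero_values.mean(), non_zero_values.std())  # 5.0 2.8284...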

Diff for: src/hyperparameter_tuning.py

+6 −7

@@ -1,12 +1,11 @@
 import gc
 
 import numpy as np
-from deepsense import neptune
 from sklearn.externals import joblib
 from steppy.base import BaseTransformer
 from steppy.utils import get_logger
 
-from .utils import set_seed
+from .utils import set_seed, NeptuneContext
 
 logger = get_logger()
 
@@ -132,14 +131,14 @@ def on_search_end(self, results):
 class NeptuneMonitor(GridSearchCallback):
     def __init__(self, name):
         self.name = name
-        self.ctx = neptune.Context()
+        self.neptune_ctx = NeptuneContext()
         self.highest_params_channel = self._create_text_channel(name='highest params')
         self.lowest_params_channel = self._create_text_channel(name='lowest params')
         self.run_params_channel = self._create_text_channel(name='run params')
         self.run_id = 0
 
     def on_run_end(self, score, params):
-        self.ctx.channel_send('score on run', x=self.run_id, y=score)
+        self.neptune_ctx.ctx.channel_send('score on run', x=self.run_id, y=score)
         self.run_params_channel.send(y=params)
         self.run_id += 1
 
@@ -148,14 +147,14 @@ def on_search_end(self, results):
         highest_score, highest_param_set = results_sorted[-1]
         lowest_score, lowest_param_set = results_sorted[0]
 
-        self.ctx.channel_send('highest score', x=0, y=highest_score)
-        self.ctx.channel_send('lowest score', x=0, y=lowest_score)
+        self.neptune_ctx.ctx.channel_send('highest score', x=0, y=highest_score)
+        self.neptune_ctx.ctx.channel_send('lowest score', x=0, y=lowest_score)
 
         self.highest_params_channel.send(y=highest_param_set)
         self.lowest_params_channel.send(y=lowest_param_set)
 
     def _create_text_channel(self, name=''):
-        return self.ctx.create_channel(name=name, channel_type=neptune.ChannelType.TEXT)
+        return self.neptune_ctx.ctx.create_channel(name=name, channel_type=self.neptune_ctx.text_channel)
 
 
 class PersistResults(GridSearchCallback):
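Note that the channel_type change is equivalence-preserving: NeptuneContext merely stores the enum constants (see the src/utils.py diff below).

    # NeptuneContext.__init__ sets:
    #     self.text_channel = neptune.ChannelType.TEXT
    # hence channel_type=self.neptune_ctx.text_channel passes the same value
    # that channel_type=neptune.ChannelType.TEXT passed before.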

Diff for: src/models.py

+3 −5

@@ -2,14 +2,13 @@
 import numpy as np
 import pandas as pd
 from attrdict import AttrDict
-from deepsense import neptune
 from sklearn.externals import joblib
 from steppy.base import BaseTransformer
 
-from .utils import get_logger
+from .utils import NeptuneContext, get_logger
 
+neptune_ctx = NeptuneContext()
 logger = get_logger()
-ctx = neptune.Context()
 
 
 class LightGBM(BaseTransformer):
@@ -130,6 +129,5 @@ def callback(env):
                 channel_name = '{}_{}_{}'.format(channel_prefix, name, loss_name)
             else:
                 channel_name = '{}_{}'.format(name, loss_name)
-            ctx.channel_send(channel_name, x=env.iteration, y=loss_value)
-
+            neptune_ctx.ctx.channel_send(channel_name, x=env.iteration, y=loss_value)
     return callback
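For readers unfamiliar with the pattern above: LightGBM invokes each registered callback with an env namedtuple carrying, among other fields, iteration and evaluation_result_list, whose entries look like (dataset_name, eval_name, result, is_higher_better). A self-contained sketch of the factory (the name neptune_monitor, and the print standing in for channel_send, are assumptions):

    def neptune_monitor(channel_prefix=''):
        def callback(env):
            # each entry is (dataset_name, eval_name, result, is_higher_better)
            for name, loss_name, loss_value, _ in env.evaluation_result_list:
                if channel_prefix:
                    channel_name = '{}_{}_{}'.format(channel_prefix, name, loss_name)
                else:
                    channel_name = '{}_{}'.format(name, loss_name)
                # the real code sends this point to Neptune:
                # neptune_ctx.ctx.channel_send(channel_name, x=env.iteration, y=loss_value)
                print(env.iteration, channel_name, loss_value)
        return callback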

Diff for: src/pipeline_blocks.py

+7 −5

@@ -283,18 +283,20 @@ def _join_features(numerical_features,
 def _get_feature_projectors(config):
     feature_projectors = []
     if config.truncated_svd.use:
-        feature_projectors.append((TruncatedSVD, config.truncated_svd.params, 'trunc_svd'))
+        feature_projectors.append((TruncatedSVD, config.truncated_svd.params, 'truncated svd'))
     if config.pca.use:
         feature_projectors.append((fe.PCA, config.pca.params, 'pca'))
     if config.fast_ica.use:
-        feature_projectors.append((fe.FastICA, config.fast_ica.params, 'fast_ica'))
+        feature_projectors.append((fe.FastICA, config.fast_ica.params, 'fast ica'))
     if config.factor_analysis.use:
-        feature_projectors.append((fe.FactorAnalysis, config.factor_analysis.params, 'factor_analysis'))
+        feature_projectors.append((fe.FactorAnalysis, config.factor_analysis.params, 'factor analysis'))
     if config.gaussian_random_projection.use:
         feature_projectors.append(
-            (fe.GaussianRandomProjection, config.gaussian_random_projection.params, 'grp'))
+            (fe.GaussianRandomProjection, config.gaussian_random_projection.params, 'gaussian random projection'))
     if config.sparse_random_projection.use:
-        feature_projectors.append((fe.SparseRandomProjection, config.sparse_random_projection.params, 'srp'))
+        feature_projectors.append((fe.SparseRandomProjection,
+                                   config.sparse_random_projection.params,
+                                   'sparse random projection'))
     return feature_projectors
 
 
Diff for: src/pipeline_config.py

+3 −4

@@ -1,12 +1,11 @@
 import os
 
 from attrdict import AttrDict
-from deepsense import neptune
 
-from .utils import read_params, parameter_eval
+from .utils import NeptuneContext, parameter_eval
 
-ctx = neptune.Context()
-params = read_params(ctx, fallback_file='neptune.yaml')
+neptune_ctx = NeptuneContext()
+params = neptune_ctx.params
 
 RANDOM_SEED = 90210
 DEV_SAMPLE_SIZE = 500

Diff for: src/pipeline_manager.py

+29 −25

@@ -3,22 +3,24 @@
 
 import numpy as np
 import pandas as pd
-from deepsense import neptune
 from scipy.stats import gmean
 from sklearn.model_selection import train_test_split
 
 from . import pipeline_config as cfg
 from .pipelines import PIPELINES
-from .utils import init_logger, read_params, set_seed, create_submission, verify_submission, \
-    log_root_mean_squared_error, KFoldByTargetValue
+from .utils import init_logger, NeptuneContext, set_seed, \
+    create_submission, verify_submission, \
+    root_mean_squared_log_error, KFoldByTargetValue
+
+neptune_ctx = NeptuneContext()
+params = neptune_ctx.params
+ctx = neptune_ctx.ctx
 
 set_seed(cfg.RANDOM_SEED)
 logger = init_logger()
-ctx = neptune.Context()
-params = read_params(ctx, fallback_file='neptune.yaml')
 
 
-class PipelineManager():
+class PipelineManager:
     def train(self, pipeline_name, dev_mode):
         train(pipeline_name, dev_mode)
 

@@ -105,10 +107,10 @@ def evaluate(pipeline_name, dev_mode):
 
     y_pred = output['prediction']
 
-    logger.info('Calculating LRMSE on validation set')
-    score = log_root_mean_squared_error(y_true, y_pred)
-    logger.info('LRMSE score on validation is {}'.format(score))
-    ctx.channel_send('LRMSE', 0, score)
+    logger.info('Calculating RMSLE on validation set')
+    score = root_mean_squared_log_error(y_true, y_pred)
+    logger.info('RMSLE score on validation is {}'.format(score))
+    ctx.channel_send('RMSLE', 0, score)
 
 
 def predict(pipeline_name, dev_mode, submit_predictions):
@@ -149,9 +151,11 @@ def predict(pipeline_name, dev_mode, submit_predictions):
 
 
 def make_submission(submission_filepath):
-    logger.info('making Kaggle submit...')
-    os.system('kaggle competitions submit -c santander-value-prediction-challenge -f {} -m {}'
-              .format(submission_filepath, params.kaggle_message))
+    logger.info('Making Kaggle submit...')
+    os.system('kaggle competitions submit -c santander-value-prediction-challenge -f {} -m {}'.format(
+        submission_filepath,
+        params.kaggle_message))
+    logger.info('Kaggle submit completed')
 
 
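One observation (not part of the commit): the -m {} placeholder is interpolated unquoted, so a kaggle_message containing spaces would be split by the shell into stray arguments; quoting it as -m "{}" would be the defensive form. The Kaggle CLI call itself is the standard one:

    # after formatting, the shell sees something like (hypothetical values):
    #   kaggle competitions submit -c santander-value-prediction-challenge -f submission.csv -m dev_run_1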
def train_evaluate_cv(pipeline_name, dev_mode):
@@ -161,7 +165,7 @@ def train_evaluate_cv(pipeline_name, dev_mode):
 
     logger.info('Reading data...')
     if dev_mode:
-        logger.info('running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE))
+        logger.info('Running in "dev-mode". Sample size is: {}'.format(cfg.DEV_SAMPLE_SIZE))
         train = pd.read_csv(params.train_filepath, nrows=cfg.DEV_SAMPLE_SIZE)
     else:
         train = pd.read_csv(params.train_filepath)
@@ -181,16 +185,16 @@ def train_evaluate_cv(pipeline_name, dev_mode):
 
         score, _, _ = _fold_fit_evaluate_loop(train_data_split, valid_data_split, fold_id, pipeline_name)
 
-        logger.info('Fold {} LRMSE {}'.format(fold_id, score))
-        ctx.channel_send('Fold {} LRMSE'.format(fold_id), 0, score)
+        logger.info('Fold {} RMSLE {}'.format(fold_id, score))
+        ctx.channel_send('Fold {} RMSLE'.format(fold_id), 0, score)
 
         fold_scores.append(score)
 
     score_mean, score_std = np.mean(fold_scores), np.std(fold_scores)
 
-    logger.info('LRMSE mean {}, LRMSE std {}'.format(score_mean, score_std))
-    ctx.channel_send('LRMSE', 0, score_mean)
-    ctx.channel_send('LRMSE STD', 0, score_std)
+    logger.info('RMSLE mean {}, RMSLE std {}'.format(score_mean, score_std))
+    ctx.channel_send('RMSLE', 0, score_mean)
+    ctx.channel_send('RMSLE STD', 0, score_std)
 
 
 def train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions):
@@ -224,8 +228,8 @@ def train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions):
                                                                          valid_data_split, test,
                                                                          fold_id, pipeline_name)
 
-        logger.info('Fold {} LRMSE {}'.format(fold_id, score))
-        ctx.channel_send('Fold {} LRMSE'.format(fold_id), 0, score)
+        logger.info('Fold {} RMSLE {}'.format(fold_id, score))
+        ctx.channel_send('Fold {} RMSLE'.format(fold_id), 0, score)
 
         out_of_fold_train_predictions.append(out_of_fold_prediction)
         out_of_fold_test_predictions.append(test_prediction)
@@ -237,9 +241,9 @@ def train_evaluate_predict_cv(pipeline_name, dev_mode, submit_predictions):
     test_prediction_aggregated = _aggregate_test_prediction(out_of_fold_test_predictions)
     score_mean, score_std = np.mean(fold_scores), np.std(fold_scores)
 
-    logger.info('LRMSE mean {}, LRMSE std {}'.format(score_mean, score_std))
-    ctx.channel_send('LRMSE', 0, score_mean)
-    ctx.channel_send('LRMSE STD', 0, score_std)
+    logger.info('RMSLE mean {}, RMSLE std {}'.format(score_mean, score_std))
+    ctx.channel_send('RMSLE', 0, score_mean)
+    ctx.channel_send('RMSLE STD', 0, score_std)
 
     logger.info('Saving predictions')
     out_of_fold_train_predictions.to_csv(os.path.join(params.experiment_directory,
@@ -317,7 +321,7 @@ def _fold_fit_evaluate_loop(train_data_split, valid_data_split, fold_id, pipeline_name):
 
     y_valid_pred = output_valid['prediction']
     y_valid_true = valid_data_split[cfg.TARGET_COLUMN].values
-    score = log_root_mean_squared_error(y_valid_true, y_valid_pred)
+    score = root_mean_squared_log_error(y_valid_true, y_valid_pred)
 
     return score, y_valid_pred, pipeline

Diff for: src/utils.py

+42 −23

@@ -7,11 +7,52 @@
 import pandas as pd
 import yaml
 from attrdict import AttrDict
+from deepsense import neptune
 from sklearn.metrics import mean_squared_error
 from sklearn.model_selection import BaseCrossValidator
 from steppy.base import BaseTransformer
 
 
+# Alex Martelli's 'Borg'
+# http://python-3-patterns-idioms-test.readthedocs.io/en/latest/Singleton.html
+class _Borg:
+    _shared_state = {}
+
+    def __init__(self):
+        self.__dict__ = self._shared_state
+
+
+class NeptuneContext(_Borg):
+    def __init__(self, fallback_file='neptune_local.yaml'):
+        _Borg.__init__(self)
+
+        self.ctx = neptune.Context()
+        self.fallback_file = fallback_file
+        self.params = self._read_params()
+        self.numeric_channel = neptune.ChannelType.NUMERIC
+        self.image_channel = neptune.ChannelType.IMAGE
+        self.text_channel = neptune.ChannelType.TEXT
+
+    def _read_params(self):
+        if self.ctx.params.__class__.__name__ == 'OfflineContextParams':
+            params = self._read_yaml().parameters
+        else:
+            params = self.ctx.params
+        return params
+
+    def _read_yaml(self):
+        with open(self.fallback_file) as f:
+            config = yaml.load(f)
+        return AttrDict(config)
+
+
+def parameter_eval(param):
+    try:
+        return eval(param)
+    except Exception:
+        return param
+
+
 def create_submission(meta, predictions):
     submission = pd.DataFrame({'ID': meta['ID'].tolist(),
                                'target': predictions
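To see why the Borg pattern yields one shared context, here is a minimal, self-contained sketch (Config is a hypothetical stand-in for NeptuneContext, whose real __init__ needs a live neptune client):

    class _Borg:
        _shared_state = {}

        def __init__(self):
            self.__dict__ = self._shared_state


    class Config(_Borg):  # stand-in for NeptuneContext
        def __init__(self):
            _Borg.__init__(self)


    a = Config()
    a.params = {'learning_rate': 0.1}
    b = Config()     # a distinct instance...
    print(b.params)  # ...but the same state: {'learning_rate': 0.1}

Relatedly, the relocated parameter_eval simply tries to evaluate stringly-typed params into Python values: parameter_eval('[0.1, 0.2]') returns the list [0.1, 0.2], while parameter_eval('lgbm') raises inside eval and falls back to the plain string.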
@@ -50,28 +91,6 @@ def init_logger():
     return logger
 
 
-def read_params(ctx, fallback_file):
-    if ctx.params.__class__.__name__ == 'OfflineContextParams':
-        neptune_config = read_yaml(fallback_file)
-        params = neptune_config.parameters
-    else:
-        params = ctx.params
-    return params
-
-
-def read_yaml(filepath):
-    with open(filepath) as f:
-        config = yaml.load(f)
-    return AttrDict(config)
-
-
-def parameter_eval(param):
-    try:
-        return eval(param)
-    except Exception:
-        return param
-
-
 def persist_evaluation_predictions(experiment_directory, y_pred, raw_data, id_column, target_column):
     raw_data.loc[:, 'y_pred'] = y_pred.reshape(-1)
     predictions_df = raw_data.loc[:, [id_column, target_column, 'y_pred']]
@@ -103,7 +122,7 @@ def root_mean_squared_error(y_true, y_pred):
     return np.sqrt(mean_squared_error(y_true, y_pred))
 
 
-def log_root_mean_squared_error(y_true, y_pred):
+def root_mean_squared_log_error(y_true, y_pred):
     return np.sqrt(mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred)))
 
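The rename fixes the terminology: the function computes root mean squared logarithmic error (RMSLE), the Santander competition's metric, not the "log of an RMSE" that the old name suggested. A quick sanity check of the formula as written, on made-up values:

    import numpy as np
    from sklearn.metrics import mean_squared_error

    y_true = np.array([100.0, 1000.0])
    y_pred = np.array([110.0, 900.0])
    rmsle = np.sqrt(mean_squared_error(np.log(1 + y_true), np.log(1 + y_pred)))
    print(rmsle)  # ~0.0999: roughly 10% relative error, independent of scale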
