diff --git a/.travis.yml b/.travis.yml
index 60b7fb6f..2c41cd20 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,11 @@ dist: bionic
 python:
   - 3.7
 
+# blocklist
+branches:
+  except:
+    - optimization-development
+
 # ===== Build for each OS ======
 matrix:
   include:
@@ -23,8 +28,12 @@ matrix:
       - git clone --single-branch --branch develop https://github.com/MStarmans91/WORCTutorial
       # 'python' points to Python 2.7 on macOS but points to Python 3.8 on Linux and Windows
       # 'python3' is a 'command not found' error on Windows but 'py' works on Windows only.
-      script: 
+      script:
       - python WORCTutorial/WORCTutorialSimple.py
+      - fastr trace "C:\Users\travis\AppData\Local\Temp\WORC_Example_STWStrategyHN\__sink_data__.json" --sinks features_train_CT_0_predict --samples HN1331
+      - fastr trace "C:\Users\travis\AppData\Local\Temp\WORC_Example_STWStrategyHN\__sink_data__.json" --sinks classification --samples all
+      - fastr trace "C:\Users\travis\AppData\Local\Temp\WORC_Example_STWStrategyHN\__sink_data__.json" --sinks classification --samples all
+      - fastr trace "C:\Users\travis\AppData\Local\Temp\GS\DEBUG_0\tmp\__sink_data__.json" --sinks output --samples id_0__0__0
     - name: "Linux"
       before_install:
         - sudo apt-get -qq update
@@ -41,6 +50,7 @@ matrix:
       - python WORCTutorial/WORCTutorialSimple.py
       - fastr trace /tmp/WORC_Example_STWStrategyHN/__sink_data__.json --sinks features_train_CT_0_predict --samples HN1331
      - fastr trace /tmp/WORC_Example_STWStrategyHN/__sink_data__.json --sinks classification --samples all
+      - fastr trace /tmp/GS/DEBUG_0/tmp/__sink_data__.json --sinks output --samples id_0__0__0
 
 notifications:
   slack:
diff --git a/CHANGELOG b/CHANGELOG
index 78a80657..e2fdd282 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,48 @@ All notable changes to this project will be documented in this file.
 The format is based on `Keep a Changelog <http://keepachangelog.com/>`_
 and this project adheres to `Semantic Versioning <http://semver.org/>`_
 
+3.3.0 - 2020-07-28
+------------------
+
+Added
+~~~~~~~
+- Graphviz visualization of network is now nicely grouped.
+- Properly integrated ObjectSampler: various resampling options now available.
+- Verbose option for the fit and score tool.
+- Validator for PyRadiomics output.
+- FAQ section in the documentation.
+
+Changed
+~~~~~~~
+- Upgraded to new versions of sklearn (0.23.1) and imbalanced-learn (0.7.0).
+- Some defaults changed, based on computation time.
+- Do not skip workflow if feature selection selects zero features,
+  but disable the feature selection.
+- Do not skip workflow if resampling is unsuccessful,
+  but disable the resampling.
+- Default scaling is now not only Z-score, but also MinMax and Robust.
+- Renamed plot SVM function and all functions using it, as now
+  we use all kinds of estimators.
+- L1 penalty does not work with the new default LR solver. Removed L1 penalty.
+
+Fixed
+~~~~~
+- Bug when using both elastix and segmentix.
+- Bug when using elastix in a train-test workflow.
+- IMPORTANT: Previously, all methods except the machine learning were fit on
+  both the training and validation set together in fitandscore. This led
+  to overfitting on the validation set. Now, these are properly split.
+- Bugfix in Evaluate standalone for decomposition tool.
+- Applied imputation in decomposition if NaNs are detected.
+- In the facade ConfigBuilder, an error is raised when incorrect
+  overrides are given.
+- Bugfix in statistical feature test plotting.
+- Bugfix in Evaluate when using ComBat.
+- Bugfix in feature converter of PyRadiomics when using 2D images.
+- Catch Graphviz error.
+- Bug in ICC.
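The "IMPORTANT" fix above is worth illustrating. Before 3.3.0, preprocessing steps such as feature scaling were fit on the training and validation folds together inside fitandscore, leaking validation statistics into the model. A minimal sketch of the wrong versus the corrected pattern, using plain scikit-learn (the names here are illustrative, not WORC internals):

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.rand(100, 5)
train, valid = np.arange(80), np.arange(80, 100)

# Leaky (pre-3.3.0 behaviour): scaler statistics computed on train + validation.
scaler_leaky = StandardScaler().fit(X)              # sees the validation fold
X_valid_leaky = scaler_leaky.transform(X[valid])

# Correct (3.3.0 behaviour): fit on the training fold only.
scaler = StandardScaler().fit(X[train])
X_valid = scaler.transform(X[valid])
```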
+
+
 3.2.2 - 2020-07-14
 ------------------
 
@@ -431,6 +473,6 @@ Fixed
 - For multiple modalities, add only optional sources like metadata when present.
 
 1.0.0rc1 - 2017-05-08
--------------------
+---------------------
 
 First release
diff --git a/README.md b/README.md
index aba49b6d..c305c5bf 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,11 @@
-# WORC v3.2.2
+# WORC v3.3.0
 
 ## Workflow for Optimal Radiomics Classification
 
 ## Information
 
 | Linux | Windows | Documentation | PyPi |Citing WORC |
 |--------------------------------|-------------------------------|-------------------------------|-------------------------------|---------------------|
-| [![][tci-linx]][tci-linx-lnk] | [![][tci-wind]][tci-wind-lnk] | [![][doc]][doc-lnk] | [![][pypi]][pypi-lnk] | [![][DOI]][DOI-lnk] |
+| [![][tci-linx]][tci-linx-lnk] | [![][tci-wind]][tci-wind-lnk] | [![][doc]][doc-lnk] | [![][pypi]][pypi-lnk] | [![][DOI]][DOI-lnk] |
 
 [tci-linx]: https://travis-ci.com/MStarmans91/WORC.svg?token=qyvaeq7Cpwu7hJGB98Gp&branch=master&job=1
 [tci-linx-lnk]: https://travis-ci.com/MStarmans91/WORC
diff --git a/README.rst b/README.rst
index 8d400e54..ad71550d 100644
--- a/README.rst
+++ b/README.rst
@@ -1,4 +1,4 @@
-WORC v3.2.2
+WORC v3.3.0
 ===========
 
 Workflow for Optimal Radiomics Classification
diff --git a/WORC/IOparser/config_io_classifier.py b/WORC/IOparser/config_io_classifier.py
index 15854f17..f4819147 100644
--- a/WORC/IOparser/config_io_classifier.py
+++ b/WORC/IOparser/config_io_classifier.py
@@ -22,14 +22,14 @@
 
 
 def load_config(config_file_path):
-    """
-    Load the config ini, parse settings to WORC
+    """Load the config ini, parse settings to WORC.
 
     Args:
         config_file_path (String): path of the .ini config file
 
     Returns:
         settings_dict (dict): dict with the loaded settings
+
     """
     if not os.path.exists(config_file_path):
         e = f'File {config_file_path} does not exist!'
@@ -42,7 +42,7 @@ def load_config(config_file_path):
                      'Labels': dict(), 'HyperOptimization': dict(),
                      'Classification': dict(), 'SelectFeatGroup': dict(),
                      'Featsel': dict(), 'FeatureScaling': dict(),
-                     'SampleProcessing': dict(), 'Imputation': dict(),
+                     'Resampling': dict(), 'Imputation': dict(),
                      'Ensemble': dict(), 'Bootstrap': dict(),
                      'FeatPreProcess': dict(), 'Evaluation': dict()}
 
@@ -58,6 +58,13 @@ def load_config(config_file_path):
     settings_dict['General']['tempsave'] =\
         settings['General'].getboolean('tempsave')
 
+    # Feature Scaling
+    settings_dict['FeatureScaling']['scale_features'] =\
+        settings['FeatureScaling'].getboolean('scale_features')
+    settings_dict['FeatureScaling']['scaling_method'] =\
+        str(settings['FeatureScaling']['scaling_method'])
+
+    # Feature selection
     settings_dict['Featsel']['Variance'] =\
         settings['Featsel'].getfloat('Variance')
 
@@ -130,6 +137,30 @@ def load_config(config_file_path):
             [str(item).strip() for item in
              settings['SelectFeatGroup'][key].split(',')]
 
+    # Settings for sample processing, i.e. oversampling, undersampling etc.
+    settings_dict['Resampling']['Use'] =\
+        settings['Resampling'].getfloat('Use')
+
+    settings_dict['Resampling']['Method'] =\
+        [str(item).strip() for item in
+         settings['Resampling']['Method'].split(',')]
+
+    settings_dict['Resampling']['sampling_strategy'] =\
+        [str(item).strip() for item in
+         settings['Resampling']['sampling_strategy'].split(',')]
+
+    settings_dict['Resampling']['n_neighbors'] =\
+        [int(str(item).strip()) for item in
+         settings['Resampling']['n_neighbors'].split(',')]
+
+    settings_dict['Resampling']['k_neighbors'] =\
+        [int(str(item).strip()) for item in
+         settings['Resampling']['k_neighbors'].split(',')]
+
+    settings_dict['Resampling']['threshold_cleaning'] =\
+        [float(str(item).strip()) for item in
+         settings['Resampling']['threshold_cleaning'].split(',')]
+
     # Classification options
     settings_dict['Classification']['fastr'] =\
         settings['Classification'].getboolean('fastr')
@@ -257,28 +288,6 @@ def load_config(config_file_path):
     settings_dict['HyperOptimization']['ranking_score'] = \
         str(settings['HyperOptimization']['ranking_score'])
 
-    settings_dict['FeatureScaling']['scale_features'] =\
-        settings['FeatureScaling'].getboolean('scale_features')
-    settings_dict['FeatureScaling']['scaling_method'] =\
-        str(settings['FeatureScaling']['scaling_method'])
-
-    # Settings for sample processing, i.e. oversampling, undersampling etc
-    settings_dict['SampleProcessing']['SMOTE'] =\
-        [str(item).strip() for item in
-         settings['SampleProcessing']['SMOTE'].split(',')]
-
-    settings_dict['SampleProcessing']['SMOTE_ratio'] =\
-        [int(str(item).strip()) for item in
-         settings['SampleProcessing']['SMOTE_ratio'].split(',')]
-
-    settings_dict['SampleProcessing']['SMOTE_neighbors'] =\
-        [int(str(item).strip()) for item in
-         settings['SampleProcessing']['SMOTE_neighbors'].split(',')]
-
-    settings_dict['SampleProcessing']['Oversampling'] =\
-        [str(item).strip() for item in
-         settings['SampleProcessing']['Oversampling'].split(',')]
-
     # Settings for ensembling
     settings_dict['Ensemble']['Use'] =\
         settings['Ensemble'].getint('Use')
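For reference, the new [Resampling] parsing follows the same pattern as the other sections: every comma-separated value in the .ini file becomes a list of candidates for the random hyperparameter search. A self-contained sketch of that pattern using the standard library (the config contents here are hypothetical, not a WORC API):

```python
import configparser

config = configparser.ConfigParser()
config.read_string("""
[Resampling]
Use = 0.20
Method = RandomUnderSampling, SMOTE, SMOTETomek
n_neighbors = 3, 12
""")

settings = {'Resampling': dict()}
settings['Resampling']['Use'] = config['Resampling'].getfloat('Use')
settings['Resampling']['Method'] =\
    [item.strip() for item in config['Resampling']['Method'].split(',')]
settings['Resampling']['n_neighbors'] =\
    [int(item.strip()) for item in config['Resampling']['n_neighbors'].split(',')]
print(settings)
```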
diff --git a/WORC/WORC.py b/WORC/WORC.py
index 3823b487..991a1581 100644
--- a/WORC/WORC.py
+++ b/WORC/WORC.py
@@ -15,21 +15,21 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import configparser
-import fastr
-from fastr.api import ResourceLimit
 import os
-from random import randint
+import yaml
+import fastr
 import graphviz
-import WORC.addexceptions as WORCexceptions
-import WORC.IOparser.config_WORC as config_io
+import configparser
+from pathlib import Path
+from random import randint
+import WORC.IOparser.file_io as io
+from fastr.api import ResourceLimit
+from WORC.tools.Slicer import Slicer
 from WORC.tools.Elastix import Elastix
 from WORC.tools.Evaluate import Evaluate
-from WORC.tools.Slicer import Slicer
+import WORC.addexceptions as WORCexceptions
+import WORC.IOparser.config_WORC as config_io
 from WORC.detectors.detectors import DebugDetector
-from pathlib import Path
-import yaml
-import WORC.IOparser.file_io as io
 
 
 class WORC(object):
@@ -104,9 +104,7 @@ class WORC(object):
         CopyMetadata: Boolean, default True
             when using elastix, copy metadata from image to segmentation or not
-
     """
-
     def __init__(self, name='test'):
         """Initialize WORC object.
@@ -185,14 +183,21 @@ def defaultconfig(self):
         config['General']['Segmentix'] = 'True'
         config['General']['FeatureCalculators'] = '[predict/CalcFeatures:1.0, pyradiomics/Pyradiomics:1.0]'
         config['General']['Preprocessing'] = 'worc/PreProcess:1.0'
-        config['General']['RegistrationNode'] = "'elastix4.8/Elastix:4.8'"
-        config['General']['TransformationNode'] = "'elastix4.8/Transformix:4.8'"
+        config['General']['RegistrationNode'] = "elastix4.8/Elastix:4.8"
+        config['General']['TransformationNode'] = "elastix4.8/Transformix:4.8"
         config['General']['Joblib_ncores'] = '1'
         config['General']['Joblib_backend'] = 'threading'
         config['General']['tempsave'] = 'False'
         config['General']['AssumeSameImageAndMaskMetadata'] = 'False'
         config['General']['ComBat'] = 'False'
 
+        # Options for the object/patient labels that are used
+        config['Labels'] = dict()
+        config['Labels']['label_names'] = 'Label1, Label2'
+        config['Labels']['modus'] = 'singlelabel'
+        config['Labels']['url'] = 'WIP'
+        config['Labels']['projectID'] = 'WIP'
+
         # Preprocessing
         config['Preprocessing'] = dict()
         config['Preprocessing']['Normalize'] = 'True'
@@ -298,12 +303,24 @@ def defaultconfig(self):
         config['ComBat'] = dict()
         config['ComBat']['language'] = 'python'
         config['ComBat']['batch'] = 'Hospital'
+        config['ComBat']['mod'] = '[]'
         config['ComBat']['par'] = '1'
         config['ComBat']['eb'] = '1'
         config['ComBat']['per_feature'] = '0'
         config['ComBat']['excluded_features'] = 'sf_, of_, semf_, pf_'
         config['ComBat']['matlab'] = 'C:\\Program Files\\MATLAB\\R2015b\\bin\\matlab.exe'
 
+        # Feature imputation
+        config['Imputation'] = dict()
+        config['Imputation']['use'] = 'True'
+        config['Imputation']['strategy'] = 'mean, median, most_frequent, constant, knn'
+        config['Imputation']['n_neighbors'] = '5, 5'
+
+        # Feature scaling options
+        config['FeatureScaling'] = dict()
+        config['FeatureScaling']['scale_features'] = 'True'
+        config['FeatureScaling']['scaling_method'] = 'z_score, robust, minmax'
+
         # Feature preprocessing before all below takes place
         config['FeatPreProcess'] = dict()
         config['FeatPreProcess']['Use'] = 'False'
@@ -356,11 +373,17 @@ def defaultconfig(self):
         config['SelectFeatGroup']['wavelet_features'] = 'True, False'
         config['SelectFeatGroup']['log_features'] = 'True, False'
 
-        # Feature imputation
-        config['Imputation'] = dict()
-        config['Imputation']['use'] = 'True'
-        config['Imputation']['strategy'] = 'mean, median, most_frequent, constant, knn'
-        config['Imputation']['n_neighbors'] = '5, 5'
+        # Resampling options
+        config['Resampling'] = dict()
+        config['Resampling']['Use'] = '0.20'
+        config['Resampling']['Method'] =\
+            'RandomUnderSampling, RandomOverSampling, NearMiss, ' +\
+            'NeighbourhoodCleaningRule, ADASYN, BorderlineSMOTE, SMOTE, ' +\
+            'SMOTEENN, SMOTETomek'
+        config['Resampling']['sampling_strategy'] = 'auto, majority, not minority, not majority, all'
+        config['Resampling']['n_neighbors'] = '3, 12'
+        config['Resampling']['k_neighbors'] = '5, 15'
+        config['Resampling']['threshold_cleaning'] = '0.25, 0.5'
 
         # Classification
         config['Classification'] = dict()
@@ -376,7 +399,7 @@ def defaultconfig(self):
         config['Classification']['RFn_estimators'] = '10, 90'
         config['Classification']['RFmin_samples_split'] = '2, 3'
         config['Classification']['RFmax_depth'] = '5, 5'
-        config['Classification']['LRpenalty'] = 'l2, l1'
+        config['Classification']['LRpenalty'] = 'l2'
         config['Classification']['LRC'] = '0.01, 1.0'
         config['Classification']['LDA_solver'] = 'svd, lsqr, eigen'
         config['Classification']['LDA_shrinkage'] = '-5, 5'
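The LRpenalty default shrinks to 'l2' because scikit-learn's LogisticRegression now defaults to the lbfgs solver, which supports only L2 (or no) regularization; requesting L1 with it raises an error, matching the CHANGELOG entry. A quick illustration in plain scikit-learn (not WORC code):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=50, random_state=0)

LogisticRegression(penalty='l2').fit(X, y)       # default lbfgs solver: fine
try:
    LogisticRegression(penalty='l1').fit(X, y)   # lbfgs cannot do L1
except ValueError as e:
    print(e)
# L1 would still work with an explicit solver, e.g. solver='liblinear'.
```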
@@ -395,36 +418,16 @@ def defaultconfig(self):
         config['CrossValidation']['test_size'] = '0.2'
         config['CrossValidation']['fixed_seed'] = 'False'
 
-        # Options for the object/patient labels that are used
-        config['Labels'] = dict()
-        config['Labels']['label_names'] = 'Label1, Label2'
-        config['ComBat']['mod'] = config['Labels']['label_names']  # Variation due to label to predict should be maintained
-        config['Labels']['modus'] = 'singlelabel'
-        config['Labels']['url'] = 'WIP'
-        config['Labels']['projectID'] = 'WIP'
-
         # Hyperparameter optimization options
         config['HyperOptimization'] = dict()
         config['HyperOptimization']['scoring_method'] = 'f1_weighted'
         config['HyperOptimization']['test_size'] = '0.15'
         config['HyperOptimization']['n_splits'] = '5'
-        config['HyperOptimization']['N_iterations'] = '10000'
-        config['HyperOptimization']['n_jobspercore'] = '2000'  # only relevant when using fastr in classification
+        config['HyperOptimization']['N_iterations'] = '25000'
+        config['HyperOptimization']['n_jobspercore'] = '1000'  # only relevant when using fastr in classification
         config['HyperOptimization']['maxlen'] = '100'
         config['HyperOptimization']['ranking_score'] = 'test_score'
 
-        # Feature scaling options
-        config['FeatureScaling'] = dict()
-        config['FeatureScaling']['scale_features'] = 'True'
-        config['FeatureScaling']['scaling_method'] = 'z_score'
-
-        # Sample processing options
-        config['SampleProcessing'] = dict()
-        config['SampleProcessing']['SMOTE'] = 'True, False'
-        config['SampleProcessing']['SMOTE_ratio'] = '1, 0'
-        config['SampleProcessing']['SMOTE_neighbors'] = '5, 15'
-        config['SampleProcessing']['Oversampling'] = 'False'
-
         # Ensemble options
         config['Ensemble'] = dict()
         config['Ensemble']['Use'] = '50'
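Since defaultconfig() just fills a configparser object, the new defaults above can be tightened per experiment before the configuration is written out. A hypothetical usage sketch (assuming defaultconfig() returns the config object, as the assignments above suggest):

```python
import WORC

experiment = WORC.WORC('example')        # the name is arbitrary
config = experiment.defaultconfig()

# Narrow the resampling search and spend fewer optimization iterations.
config['Resampling']['Method'] = 'SMOTE, RandomOverSampling'
config['HyperOptimization']['N_iterations'] = '10000'
```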
@@ -489,20 +492,24 @@ def build_training(self):
             image_types.append(self.configs[c]['ImageFeatures']['image_type'])
 
         # Create config source
-        self.source_class_config = self.network.create_source('ParameterFile', id='config_classification_source', node_group='conf')
+        self.source_class_config = self.network.create_source('ParameterFile', id='config_classification_source', node_group='conf', step_id='general_sources')
 
         # Classification tool and label source
-        self.source_patientclass_train = self.network.create_source('PatientInfoFile', id='patientclass_train', node_group='pctrain')
+        self.source_patientclass_train = self.network.create_source('PatientInfoFile', id='patientclass_train', node_group='pctrain', step_id='train_sources')
 
         if self.labels_test:
-            self.source_patientclass_test = self.network.create_source('PatientInfoFile', id='patientclass_test', node_group='pctest')
+            self.source_patientclass_test = self.network.create_source('PatientInfoFile', id='patientclass_test', node_group='pctest', step_id='test_sources')
 
         memory = self.fastr_memory_parameters['Classification']
-        self.classify = self.network.create_node('worc/TrainClassifier:1.0', tool_version='1.0', id='classify', resources=ResourceLimit(memory=memory))
+        self.classify = self.network.create_node('worc/TrainClassifier:1.0',
+                                                 tool_version='1.0',
+                                                 id='classify',
+                                                 resources=ResourceLimit(memory=memory),
+                                                 step_id='WorkflowOptimization')
 
         # Outputs
-        self.sink_classification = self.network.create_sink('HDF5', id='classification')
-        self.sink_performance = self.network.create_sink('JsonFile', id='performance')
-        self.sink_class_config = self.network.create_sink('ParameterFile', id='config_classification_sink', node_group='conf')
+        self.sink_classification = self.network.create_sink('HDF5', id='classification', step_id='general_sinks')
+        self.sink_performance = self.network.create_sink('JsonFile', id='performance', step_id='general_sinks')
+        self.sink_class_config = self.network.create_sink('ParameterFile', id='config_classification_sink', node_group='conf', step_id='general_sinks')
 
         # Links
         self.sink_class_config.input = self.source_class_config.output
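Every create_source/create_node/create_sink call in this file gains a step_id; fastr uses it to group nodes into labelled steps, which is what makes the Graphviz drawing of the network "nicely grouped" per the CHANGELOG. A minimal sketch of the pattern, assuming the fastr 3.x API used throughout this file:

```python
import fastr

network = fastr.create_network(id='step_id_demo')
source = network.create_source('ITKImageFile', id='images',
                               step_id='train_sources')
sink = network.create_sink('ITKImageFile', id='images_out',
                           step_id='train_sinks')
sink.input = source.output
network.draw(file_path='step_id_demo.svg')  # nodes are drawn per step group
```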
@@ -631,38 +638,62 @@ def build_training(self):
                 self.modlabels.append(label)
 
                 # Create required sources and sinks
-                self.sources_parameters[label] = self.network.create_source('ParameterFile', id='config_' + label)
-                self.sources_images_train[label] = self.network.create_source('ITKImageFile', id='images_train_' + label, node_group='train')
+                self.sources_parameters[label] = self.network.create_source('ParameterFile', id='config_' + label, step_id='general_sources')
+                self.sources_images_train[label] = self.network.create_source('ITKImageFile', id='images_train_' + label, node_group='train', step_id='train_sources')
                 if self.TrainTest:
-                    self.sources_images_test[label] = self.network.create_source('ITKImageFile', id='images_test_' + label, node_group='test')
+                    self.sources_images_test[label] = self.network.create_source('ITKImageFile', id='images_test_' + label, node_group='test', step_id='test_sources')
 
                 if self.metadata_train and len(self.metadata_train) >= nmod + 1:
-                    self.sources_metadata_train[label] = self.network.create_source('DicomImageFile', id='metadata_train_' + label, node_group='train')
+                    self.sources_metadata_train[label] = self.network.create_source('DicomImageFile', id='metadata_train_' + label, node_group='train', step_id='train_sources')
 
                 if self.metadata_test and len(self.metadata_test) >= nmod + 1:
-                    self.sources_metadata_test[label] = self.network.create_source('DicomImageFile', id='metadata_test_' + label, node_group='test')
+                    self.sources_metadata_test[label] = self.network.create_source('DicomImageFile', id='metadata_test_' + label, node_group='test', step_id='test_sources')
 
                 if self.masks_train and len(self.masks_train) >= nmod + 1:
                     # Create mask source and convert
-                    self.sources_masks_train[label] = self.network.create_source('ITKImageFile', id='mask_train_' + label, node_group='train')
+                    self.sources_masks_train[label] = self.network.create_source('ITKImageFile', id='mask_train_' + label, node_group='train', step_id='train_sources')
                     memory = self.fastr_memory_parameters['WORCCastConvert']
-                    self.converters_masks_train[label] = self.network.create_node('worc/WORCCastConvert:0.3.2', tool_version='0.1', id='convert_mask_train_' + label, node_group='train', resources=ResourceLimit(memory=memory))
+                    self.converters_masks_train[label] =\
+                        self.network.create_node('worc/WORCCastConvert:0.3.2',
+                                                 tool_version='0.1',
+                                                 id='convert_mask_train_' + label,
+                                                 node_group='train',
+                                                 resources=ResourceLimit(memory=memory),
+                                                 step_id='FileConversion')
+
                     self.converters_masks_train[label].inputs['image'] = self.sources_masks_train[label].output
 
                 if self.masks_test and len(self.masks_test) >= nmod + 1:
                     # Create mask source and convert
-                    self.sources_masks_test[label] = self.network.create_source('ITKImageFile', id='mask_test_' + label, node_group='test')
+                    self.sources_masks_test[label] = self.network.create_source('ITKImageFile', id='mask_test_' + label, node_group='test', step_id='test_sources')
                     memory = self.fastr_memory_parameters['WORCCastConvert']
-                    self.converters_masks_test[label] = self.network.create_node('worc/WORCCastConvert:0.3.2', tool_version='0.1', id='convert_mask_test_' + label, node_group='test', resources=ResourceLimit(memory=memory))
+                    self.converters_masks_test[label] =\
+                        self.network.create_node('worc/WORCCastConvert:0.3.2',
+                                                 tool_version='0.1',
+                                                 id='convert_mask_test_' + label,
+                                                 node_group='test',
+                                                 resources=ResourceLimit(memory=memory),
+                                                 step_id='FileConversion')
+
                     self.converters_masks_test[label].inputs['image'] = self.sources_masks_test[label].output
 
                 # First convert the images
                 if any(modality in mod for modality in ['MR', 'CT', 'MG', 'PET']):
                     # Use WORC PXCastConvet for converting image formats
                     memory = self.fastr_memory_parameters['WORCCastConvert']
-                    self.converters_im_train[label] = self.network.create_node('worc/WORCCastConvert:0.3.2', tool_version='0.1', id='convert_im_train_' + label, resources=ResourceLimit(memory=memory))
+                    self.converters_im_train[label] =\
+                        self.network.create_node('worc/WORCCastConvert:0.3.2',
+                                                 tool_version='0.1',
+                                                 id='convert_im_train_' + label,
+                                                 resources=ResourceLimit(memory=memory),
+                                                 step_id='FileConversion')
                     if self.TrainTest:
-                        self.converters_im_test[label] = self.network.create_node('worc/WORCCastConvert:0.3.2', tool_version='0.1', id='convert_im_test_' + label, resources=ResourceLimit(memory=memory))
+                        self.converters_im_test[label] =\
+                            self.network.create_node('worc/WORCCastConvert:0.3.2',
+                                                     tool_version='0.1',
+                                                     id='convert_im_test_' + label,
+                                                     resources=ResourceLimit(memory=memory),
+                                                     step_id='FileConversion')
                 else:
                     raise WORCexceptions.WORCTypeError(('No valid image type for modality {}: {} provided.').format(str(nmod), mod))
 
@@ -705,13 +736,15 @@ def build_training(self):
                     self.sources_segmentations_train[label] =\
                         self.network.create_source('ITKImageFile',
                                                    id='segmentations_train_' + label,
-                                                   node_group='train')
+                                                   node_group='train',
+                                                   step_id='train_sources')
 
                     self.converters_seg_train[label] =\
                         self.network.create_node('worc/WORCCastConvert:0.3.2',
                                                  tool_version='0.1',
                                                  id='convert_seg_train_' + label,
-                                                 resources=ResourceLimit(memory=memory))
+                                                 resources=ResourceLimit(memory=memory),
+                                                 step_id='FileConversion')
 
                     self.converters_seg_train[label].inputs['image'] =\
                         self.sources_segmentations_train[label].output
@@ -720,13 +753,15 @@ def build_training(self):
                         self.sources_segmentations_test[label] =\
                             self.network.create_source('ITKImageFile',
                                                        id='segmentations_test_' + label,
-                                                       node_group='test')
+                                                       node_group='test',
+                                                       step_id='test_sources')
 
                         self.converters_seg_test[label] =\
                             self.network.create_node('worc/WORCCastConvert:0.3.2',
                                                      tool_version='0.1',
                                                      id='convert_seg_test_' + label,
-                                                     resources=ResourceLimit(memory=memory))
+                                                     resources=ResourceLimit(memory=memory),
+                                                     step_id='FileConversion')
 
                         self.converters_seg_test[label].inputs['image'] =\
                             self.sources_segmentations_test[label].output
@@ -796,7 +831,7 @@ def build_training(self):
 
                 for i_node, fname in enumerate(self.featurecalculators[label]):
                     # Create sink for feature outputs
-                    self.sinks_features_train[label].append(self.network.create_sink('HDF5', id='features_train_' + label + '_' + fname))
+                    self.sinks_features_train[label].append(self.network.create_sink('HDF5', id='features_train_' + label + '_' + fname, step_id='train_sinks'))
 
                     # Append features to the classification
                     if not self.configs[0]['General']['ComBat'] == 'True':
@@ -809,7 +844,7 @@ def build_training(self):
                     # Similar for testing workflow
                     if self.TrainTest:
                         # Create sink for feature outputs
-                        self.sinks_features_test[label].append(self.network.create_sink('HDF5', id='features_test_' + label + '_' + fname))
+                        self.sinks_features_test[label].append(self.network.create_sink('HDF5', id='features_test_' + label + '_' + fname, step_id='test_sinks'))
 
                         # Append features to the classification
                         if not self.configs[0]['General']['ComBat'] == 'True':
@@ -840,7 +875,7 @@ def build_training(self):
                 self.modlabels.append(label)
 
                 # Create a node for the feature computation
-                self.sources_features_train[label] = self.network.create_source('HDF5', id='features_train_' + label, node_group='train')
+                self.sources_features_train[label] = self.network.create_source('HDF5', id='features_train_' + label, node_group='train', step_id='train_sources')
 
                 # Add the features from this modality to the classifier node input
                 self.links_C1_train[label] = self.classify.inputs['features_train'][str(label)] << self.sources_features_train[label].output
@@ -848,7 +883,7 @@ def build_training(self):
 
                 if self.features_test:
                     # Create a node for the feature computation
-                    self.sources_features_test[label] = self.network.create_source('HDF5', id='features_test_' + label, node_group='test')
+                    self.sources_features_test[label] = self.network.create_source('HDF5', id='features_test_' + label, node_group='test', step_id='test_sources')
 
                     # Add the features from this modality to the classifier node input
                     self.links_C1_test[label] = self.classify.inputs['features_test'][str(label)] << self.sources_features_test[label].output
@@ -869,10 +904,11 @@ def add_ComBat(self):
             self.network.create_node('combat/ComBat:1.0',
                                      tool_version='1.0',
                                      id='ComBat',
-                                     resources=ResourceLimit(memory=memory))
+                                     resources=ResourceLimit(memory=memory),
+                                     step_id='ComBat')
 
         # Create sink for ComBat output
-        self.sinks_features_train_ComBat = self.network.create_sink('HDF5', id='features_train_ComBat')
+        self.sinks_features_train_ComBat = self.network.create_sink('HDF5', id='features_train_ComBat', step_id='ComBat')
 
         # Create links for inputs
         self.link_combat_1 = self.network.create_link(self.source_class_config.output, self.ComBat.inputs['config'])
@@ -889,7 +925,7 @@ def add_ComBat(self):
 
         if self.TrainTest:
             # Create sink for ComBat output
-            self.sinks_features_test_ComBat = self.network.create_sink('HDF5', id='features_test_ComBat')
+            self.sinks_features_test_ComBat = self.network.create_sink('HDF5', id='features_test_ComBat', step_id='ComBat')
 
             # Create links for inputs
             self.link_combat_3 = self.network.create_link(self.source_patientclass_test.output, self.ComBat.inputs['patientclass_test'])
@@ -903,9 +939,9 @@ def add_ComBat(self):
     def add_preprocessing(self, preprocess_node, label, nmod):
         """Add nodes required for preprocessing of images."""
         memory = self.fastr_memory_parameters['Preprocessing']
-        self.preprocessing_train[label] = self.network.create_node(preprocess_node, tool_version='1.0', id='preprocessing_train_' + label, resources=ResourceLimit(memory=memory))
+        self.preprocessing_train[label] = self.network.create_node(preprocess_node, tool_version='1.0', id='preprocessing_train_' + label, resources=ResourceLimit(memory=memory), step_id='Preprocessing')
         if self.TrainTest:
-            self.preprocessing_test[label] = self.network.create_node(preprocess_node, tool_version='1.0', id='preprocessing_test_' + label, resources=ResourceLimit(memory=memory))
+            self.preprocessing_test[label] = self.network.create_node(preprocess_node, tool_version='1.0', id='preprocessing_test_' + label, resources=ResourceLimit(memory=memory), step_id='Preprocessing')
 
         # Create required links
         self.preprocessing_train[label].inputs['parameters'] = self.sources_parameters[label].output
@@ -923,11 +959,11 @@ def add_preprocessing(self, preprocess_node, label, nmod):
 
         # If there are masks to use in normalization, add them here
         if self.masks_normalize_train:
-            self.sources_masks_normalize_train[label] = self.network.create_source('ITKImageFile', id='masks_normalize_train_' + label, node_group='train')
+            self.sources_masks_normalize_train[label] = self.network.create_source('ITKImageFile', id='masks_normalize_train_' + label, node_group='train', step_id='Preprocessing')
             self.preprocessing_train[label].inputs['mask'] = self.sources_masks_normalize_train[label].output
 
         if self.masks_normalize_test:
-            self.sources_masks_normalize_test[label] = self.network.create_source('ITKImageFile', id='masks_normalize_test_' + label, node_group='test')
+            self.sources_masks_normalize_test[label] = self.network.create_source('ITKImageFile', id='masks_normalize_test_' + label, node_group='test', step_id='Preprocessing')
             self.preprocessing_test[label].inputs['mask'] = self.sources_masks_normalize_test[label].output
 
     def add_feature_calculator(self, calcfeat_node, label, nmod):
@@ -942,14 +978,16 @@ def add_feature_calculator(self, calcfeat_node, label, nmod):
             self.network.create_node(calcfeat_node,
                                      tool_version='1.0',
                                      id='calcfeatures_train_' + node_ID,
-                                     resources=ResourceLimit(memory=memory))
+                                     resources=ResourceLimit(memory=memory),
+                                     step_id='Feature_Extraction')
 
         if self.TrainTest:
             node_test =\
                 self.network.create_node(calcfeat_node,
                                          tool_version='1.0',
                                          id='calcfeatures_test_' + node_ID,
-                                         resources=ResourceLimit(memory=memory))
+                                         resources=ResourceLimit(memory=memory),
+                                         step_id='Feature_Extraction')
 
         # Check if we need to add pyradiomics specific sources
         if 'pyradiomics' in calcfeat_node.lower():
@@ -957,14 +995,16 @@ def add_feature_calculator(self, calcfeat_node, label, nmod):
             self.source_config_pyradiomics[label] =\
                 self.network.create_source('YamlFile',
                                            id='config_pyradiomics_' + label,
-                                           node_group='train')
+                                           node_group='train',
+                                           step_id='Feature_Extraction')
 
             # Add a format source, which we are going to set to a constant
             # And attach to the tool node
             self.source_format_pyradiomics =\
                 self.network.create_constant('String', 'csv',
                                              id='format_pyradiomics_' + label,
-                                             node_group='train')
+                                             node_group='train',
+                                             step_id='Feature_Extraction')
             node_train.inputs['format'] =\
                 self.source_format_pyradiomics.output
 
@@ -1009,7 +1049,8 @@ def add_feature_calculator(self, calcfeat_node, label, nmod):
         if self.semantics_train and len(self.semantics_train) >= nmod + 1:
             self.sources_semantics_train[label] =\
                 self.network.create_source('CSVFile',
-                                           id='semantics_train_' + label)
+                                           id='semantics_train_' + label,
+                                           step_id='train_sources')
 
             node_train.inputs['semantics'] =\
                 self.sources_semantics_train[label].output
 
@@ -1017,7 +1058,8 @@ def add_feature_calculator(self, calcfeat_node, label, nmod):
         if self.semantics_test and len(self.semantics_test) >= nmod + 1:
             self.sources_semantics_test[label] =\
                 self.network.create_source('CSVFile',
-                                           id='semantics_test_' + label)
+                                           id='semantics_test_' + label,
+                                           step_id='test_sources')
             node_test.inputs['semantics'] =\
                 self.sources_semantics_test[label].output
 
@@ -1026,7 +1068,8 @@ def add_feature_calculator(self, calcfeat_node, label, nmod):
             self.network.create_node('worc/FeatureConverter:1.0',
                                      tool_version='1.0',
                                      id='featureconverter_train_' + node_ID,
-                                     resources=ResourceLimit(memory='4G'))
+                                     resources=ResourceLimit(memory='4G'),
+                                     step_id='Feature_Extraction')
 
         conv_train.inputs['feat_in'] = node_train.outputs['features']
 
@@ -1041,7 +1084,8 @@ def add_feature_calculator(self, calcfeat_node, label, nmod):
             self.source_toolbox_name[label] =\
                 self.network.create_constant('String', toolbox,
-                                             id=f'toolbox_name_{toolbox}_{label}')
+                                             id=f'toolbox_name_{toolbox}_{label}',
+                                             step_id='Feature_Extraction')
 
         conv_train.inputs['toolbox'] = self.source_toolbox_name[label].output
         conv_train.inputs['config'] = self.sources_parameters[label].output
@@ -1051,7 +1095,8 @@ def add_feature_calculator(self, calcfeat_node, label, nmod):
                 self.network.create_node('worc/FeatureConverter:1.0',
                                          tool_version='1.0',
                                          id='featureconverter_test_' + node_ID,
-                                         resources=ResourceLimit(memory='4G'))
+                                         resources=ResourceLimit(memory='4G'),
+                                         step_id='Feature_Extraction')
 
             conv_test.inputs['feat_in'] = node_test.outputs['features']
             conv_test.inputs['toolbox'] = self.source_toolbox_name[label].output
@@ -1099,13 +1144,15 @@ def add_elastix(self, label, nmod):
                     self.sources_segmentations_train[label] =\
                         self.network.create_source('ITKImageFile',
                                                    id='segmentations_train_' + label,
-                                                   node_group='input')
+                                                   node_group='input',
+                                                   step_id='train_sources')
 
                     self.converters_seg_train[label] =\
                         self.network.create_node('worc/WORCCastConvert:0.3.2',
                                                  tool_version='0.1',
                                                  id='convert_seg_train_' + label,
-                                                 resources=ResourceLimit(memory=memory))
+                                                 resources=ResourceLimit(memory=memory),
+                                                 step_id='FileConversion')
 
                     self.converters_seg_train[label].inputs['image'] =\
                         self.sources_segmentations_train[label].output
@@ -1114,13 +1161,15 @@ def add_elastix(self, label, nmod):
                         self.sources_segmentations_test[label] =\
                             self.network.create_source('ITKImageFile',
                                                        id='segmentations_test_' + label,
-                                                       node_group='input')
+                                                       node_group='input',
+                                                       step_id='test_sources')
 
                         self.converters_seg_test[label] =\
                             self.network.create_node('worc/WORCCastConvert:0.3.2',
                                                      tool_version='0.1',
                                                      id='convert_seg_test_' + label,
-                                                     resources=ResourceLimit(memory=memory))
+                                                     resources=ResourceLimit(memory=memory),
+                                                     step_id='FileConversion')
 
                         self.converters_seg_test[label].inputs['image'] =\
                             self.sources_segmentations_test[label].output
@@ -1140,39 +1189,45 @@ def add_elastix(self, label, nmod):
                 self.network.create_node(elastix_node,
                                          tool_version='0.2',
                                          id='elastix_train_' + label,
-                                         resources=ResourceLimit(memory=memory_elastix))
+                                         resources=ResourceLimit(memory=memory_elastix),
+                                         step_id='Image_Registration')
 
             memory_transformix = self.fastr_memory_parameters['Elastix']
             self.transformix_seg_nodes_train[label] =\
                 self.network.create_node(transformix_node,
                                          tool_version='0.2',
                                          id='transformix_seg_train_' + label,
-                                         resources=ResourceLimit(memory=memory_transformix))
+                                         resources=ResourceLimit(memory=memory_transformix),
+                                         step_id='Image_Registration')
 
             self.transformix_im_nodes_train[label] =\
                 self.network.create_node(transformix_node,
                                          tool_version='0.2',
                                          id='transformix_im_train_' + label,
-                                         resources=ResourceLimit(memory=memory_transformix))
+                                         resources=ResourceLimit(memory=memory_transformix),
+                                         step_id='Image_Registration')
 
             if self.TrainTest:
                 self.elastix_nodes_test[label] =\
                     self.network.create_node(elastix_node,
                                              tool_version='0.2',
                                              id='elastix_test_' + label,
-                                             resources=ResourceLimit(memory=memory_elastix))
+                                             resources=ResourceLimit(memory=memory_elastix),
+                                             step_id='Image_Registration')
 
                 self.transformix_seg_nodes_test[label] =\
                     self.network.create_node(transformix_node,
                                              tool_version='0.2',
                                              id='transformix_seg_test_' + label,
-                                             resources=ResourceLimit(memory=memory_transformix))
+                                             resources=ResourceLimit(memory=memory_transformix),
+                                             step_id='Image_Registration')
 
                 self.transformix_im_nodes_test[label] =\
                     self.network.create_node(transformix_node,
                                              tool_version='0.2',
                                              id='transformix_im_test_' + label,
-                                             resources=ResourceLimit(memory=memory_transformix))
+                                             resources=ResourceLimit(memory=memory_transformix),
+                                             step_id='Image_Registration')
 
             # Create sources_segmentation
             # M1 = moving, others = fixed
@@ -1194,7 +1249,8 @@ def add_elastix(self, label, nmod):
                 self.copymetadata_nodes_train[self.modlabels[0]] =\
                     self.network.create_node('itktools/0.3.2/CopyMetadata:1.0',
                                              tool_version='1.0',
-                                             id='CopyMetadata_train_' + self.modlabels[0])
+                                             id='CopyMetadata_train_' + self.modlabels[0],
+                                             step_id='Image_Registration')
 
                 self.copymetadata_nodes_train[self.modlabels[0]].inputs["source"] =\
                     self.converters_im_train[self.modlabels[0]].outputs['image']
@@ -1224,7 +1280,8 @@ def add_elastix(self, label, nmod):
                     self.copymetadata_nodes_test[self.modlabels[0]] =\
                         self.network.create_node('itktools/0.3.2/CopyMetadata:1.0',
                                                  tool_version='1.0',
-                                                 id='CopyMetadata_test_' + self.modlabels[0])
+                                                 id='CopyMetadata_test_' + self.modlabels[0],
+                                                 step_id='Image_Registration')
 
                     self.copymetadata_nodes_test[self.modlabels[0]].inputs["source"] =\
                         self.converters_im_test[self.modlabels[0]].outputs['image']
@@ -1242,7 +1299,8 @@ def add_elastix(self, label, nmod):
             self.source_Elastix_Parameters[label] =\
                 self.network.create_source('ElastixParameterFile',
                                            id='Elastix_Para_' + label,
-                                           node_group='elpara')
+                                           node_group='elpara',
+                                           step_id='Image_Registration')
 
             self.link_elparam_train =\
                 self.network.create_link(self.source_Elastix_Parameters[label].output,
@@ -1275,7 +1333,8 @@ def add_elastix(self, label, nmod):
                 self.edittransformfile_nodes_train[label] =\
                     self.network.create_node('elastixtools/EditElastixTransformFile:0.1',
                                              tool_version='0.1',
-                                             id='EditElastixTransformFile' + label)
+                                             id='EditElastixTransformFile_train_' + label,
+                                             step_id='Image_Registration')
 
                 self.edittransformfile_nodes_train[label].inputs['set'] =\
                     ["FinalBSplineInterpolationOrder=0"]
@@ -1287,7 +1346,8 @@ def add_elastix(self, label, nmod):
                     self.edittransformfile_nodes_test[label] =\
                         self.network.create_node('elastixtools/EditElastixTransformFile:0.1',
                                                  tool_version='0.1',
-                                                 id='EditElastixTransformFile' + label)
+                                                 id='EditElastixTransformFile_test_' + label,
+                                                 step_id='Image_Registration')
 
                     self.edittransformfile_nodes_test[label].inputs['set'] =\
                         ["FinalBSplineInterpolationOrder=0"]
@@ -1315,25 +1375,30 @@ def add_elastix(self, label, nmod):
                 self.transformix_im_nodes_test[label].inputs['image'] =\
                     self.converters_im_test[self.modlabels[0]].outputs['image']
 
-            for i_node in range(len(self.calcfeatures_train[label])):
-                self.calcfeatures_train[label][i_node].inputs['segmentation'] =\
-                    self.transformix_seg_nodes_train[label].outputs['image']
-                if self.TrainTest:
-                    self.calcfeatures_test[label][i_node].inputs['segmentation'] =\
-                        self.transformix_seg_nodes_test[label].outputs['image']
+            if self.configs[nmod]['General']['Segmentix'] != 'True':
+                # These segmentations serve as input for the feature calculation
+                for i_node in range(len(self.calcfeatures_train[label])):
+                    self.calcfeatures_train[label][i_node].inputs['segmentation'] =\
+                        self.transformix_seg_nodes_train[label].outputs['image']
+                    if self.TrainTest:
+                        self.calcfeatures_test[label][i_node].inputs['segmentation'] =\
+                            self.transformix_seg_nodes_test[label].outputs['image']
 
             # Save outputfor the training set
             self.sinks_transformations_train[label] =\
                 self.network.create_sink('ElastixTransformFile',
-                                         id='transformations_train_' + label)
+                                         id='transformations_train_' + label,
+                                         step_id='train_sinks')
             self.sinks_segmentations_elastix_train[label] =\
                 self.network.create_sink('ITKImageFile',
-                                         id='segmentations_out_elastix_train_' + label)
+                                         id='segmentations_out_elastix_train_' + label,
+                                         step_id='train_sinks')
             self.sinks_images_elastix_train[label] =\
                 self.network.create_sink('ITKImageFile',
-                                         id='images_out_elastix_train_' + label)
+                                         id='images_out_elastix_train_' + label,
+                                         step_id='train_sinks')
 
             self.sinks_transformations_train[label].input =\
                 self.elastix_nodes_train[label].outputs['transform']
@@ -1348,14 +1413,18 @@ def add_elastix(self, label, nmod):
             if self.TrainTest:
                 self.sinks_transformations_test[label] =\
                     self.network.create_sink('ElastixTransformFile',
-                                             id='transformations_test_' + label)
+                                             id='transformations_test_' + label,
+                                             step_id='test_sinks')
                 self.sinks_segmentations_elastix_test[label] =\
                     self.network.create_sink('ITKImageFile',
-                                             id='segmentations_out_elastix_test_' + label)
+                                             id='segmentations_out_elastix_test_' + label,
+                                             step_id='test_sinks')
                 self.sinks_images_elastix_test[label] =\
-                    self.network.create_sink('ITKImageFile', id='images_out_elastix_test_' + label)
-                self.sinks_transformations_elastix_test[label].input =\
+                    self.network.create_sink('ITKImageFile',
+                                             id='images_out_elastix_test_' + label,
+                                             step_id='test_sinks')
+                self.sinks_transformations_test[label].input =\
                     self.elastix_nodes_test[label].outputs['transform']
                 self.sinks_segmentations_elastix_test[label].input =\
                     self.transformix_seg_nodes_test[label].outputs['image']
@@ -1370,14 +1439,16 @@ def add_segmentix(self, label, nmod):
         if label not in self.sinks_segmentations_segmentix_train:
             self.sinks_segmentations_segmentix_train[label] =\
                 self.network.create_sink('ITKImageFile',
                                          id='segmentations_out_segmentix_train_' + label)
+                self.network.create_sink('ITKImageFile',
+                                         id='segmentations_out_segmentix_train_' + label,
+                                         step_id='train_sinks')
 
         memory = self.fastr_memory_parameters['Segmentix']
         self.nodes_segmentix_train[label] =\
             self.network.create_node('segmentix/Segmentix:1.0',
                                      tool_version='1.0',
                                      id='segmentix_train_' + label,
-                                     resources=ResourceLimit(memory=memory))
+                                     resources=ResourceLimit(memory=memory),
+                                     step_id='Preprocessing')
 
         # Input the image
         self.nodes_segmentix_train[label].inputs['image'] =\
@@ -1407,11 +1478,15 @@ def add_segmentix(self, label, nmod):
         if self.TrainTest:
             self.sinks_segmentations_segmentix_test[label] =\
                 self.network.create_sink('ITKImageFile',
-                                         id='segmentations_out_segmentix_test_' + label)
+                                         id='segmentations_out_segmentix_test_' + label,
+                                         step_id='test_sinks')
+
             self.nodes_segmentix_test[label] =\
                 self.network.create_node('segmentix/Segmentix:1.0',
                                          tool_version='1.0',
-                                         id='segmentix_test_' + label, resources=ResourceLimit(memory=memory))
+                                         id='segmentix_test_' + label,
+                                         resources=ResourceLimit(memory=memory),
+                                         step_id='Preprocessing')
 
             self.nodes_segmentix_test[label].inputs['image'] =\
                 self.converters_im_test[label].outputs['image']
@@ -1569,6 +1644,8 @@ def execute(self):
                 self.network.draw(file_path=self.network.id + '.svg', draw_dimensions=True)
             except graphviz.backend.ExecutableNotFound:
                 print('[WORC WARNING] Graphviz executable not found: not drawing network diagram. Make sure the Graphviz executables are on your systems PATH.')
+            except graphviz.backend.CalledProcessError as e:
+                print(f'[WORC WARNING] Graphviz executable gave an error: not drawing network diagram. Original error: {e}')
 
         if DebugDetector().do_detection():
             print("Source Data:")
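The new except clause covers the case where the Graphviz binaries exist but fail at render time; the existing clause only caught a missing executable ("Catch Graphviz error" in the CHANGELOG). The same defensive pattern in isolation, assuming the graphviz Python package version used here, where both exceptions live under graphviz.backend:

```python
import graphviz

dot = graphviz.Digraph('toy')
dot.edge('source', 'sink')
try:
    dot.render('toy.gv')
except graphviz.backend.ExecutableNotFound:
    print('Graphviz not installed or not on PATH: skipping drawing.')
except graphviz.backend.CalledProcessError as e:
    print(f'Graphviz failed while drawing: {e}')
```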
diff --git a/WORC/classification/AdvancedSampler.py b/WORC/classification/AdvancedSampler.py
index d45a7b6e..44399938 100644
--- a/WORC/classification/AdvancedSampler.py
+++ b/WORC/classification/AdvancedSampler.py
@@ -17,7 +17,7 @@
 
 from sklearn.utils import check_random_state
 import numpy as np
-from sklearn.externals import six
+import six
 from ghalton import Halton
 # from sobol_seq import i4_sobol_generate as Sobol
 import scipy
diff --git a/WORC/classification/ObjectSampler.py b/WORC/classification/ObjectSampler.py
index 90729b62..6bb59e74 100644
--- a/WORC/classification/ObjectSampler.py
+++ b/WORC/classification/ObjectSampler.py
@@ -15,7 +15,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from imblearn import over_sampling, under_sampling, combine, SMOTE
+from imblearn import over_sampling, under_sampling, combine
 import numpy as np
 from sklearn.utils import check_random_state
 import WORC.addexceptions as ae
@@ -34,45 +34,44 @@ class ObjectSampler(object):
 
     def __init__(self, method,
                  sampling_strategy='auto',
-                 ratio=1,
-                 k_neighbors=5,
-                 kind='borderline-1',
                  n_jobs=1,
                  n_neighbors=3,
-                 threshold_cleaning=0.5
-                 ):
+                 k_neighbors=5,
+                 threshold_cleaning=0.5,
+                 verbose=True):
         """Initialize object."""
         # Initialize a random state
         self.random_seed = np.random.randint(5000)
         self.random_state = check_random_state(self.random_seed)
 
         # Initialize all objects as Nones: overriden when required by functions
-        self.sampling_strategy = None
         self.object = None
-        self.k_neighbors = None
+        self.sampling_strategy = None
         self.n_jobs = None
-        self.ratio = None
+        self.n_neighbors = None
+        self.k_neighbors = None
+        self.threshold_cleaning = None
+        self.verbose = verbose
 
         if method == 'RandomUnderSampling':
             self.init_RandomUnderSampling(sampling_strategy)
         elif method == 'NearMiss':
-            self.init_NearMiss(sampling_strategy, n_neighbors, n_jobs)
-        elif method == 'NeigbourhoodCleaningRule':
-            self.init_NeigbourhoodCleaningRule(sampling_strategy, n_neighbors,
-                                               n_jobs, threshold_cleaning)
+            self.init_NearMiss(sampling_strategy, n_jobs)
+        elif method == 'NeighbourhoodCleaningRule':
+            self.init_NeighbourhoodCleaningRule(sampling_strategy, n_neighbors,
+                                                n_jobs, threshold_cleaning)
         elif method == 'RandomOverSampling':
             self.init_RandomOverSampling(sampling_strategy)
         elif method == 'ADASYN':
-            self.init_ADASYN(sampling_strategy, ratio, n_neighbors, n_jobs)
+            self.init_ADASYN(sampling_strategy, n_neighbors, n_jobs)
         elif method == 'BorderlineSMOTE':
-            self.init_BorderlineSMOTE(ratio, k_neighbors, kind, n_jobs)
+            self.init_BorderlineSMOTE(k_neighbors, n_jobs)
         elif method == 'SMOTE':
-            self.init_SMOTE(ratio, k_neighbors, kind, n_jobs)
+            self.init_SMOTE(k_neighbors, n_jobs)
         elif method == 'SMOTEENN':
-            self.init_SMOTEENN(sampling_strategy, ratio, k_neighbors, kind,
-                               n_jobs)
+            self.init_SMOTEENN(sampling_strategy)
         elif method == 'SMOTETomek':
-            self.init_SMOTETomek(sampling_strategy, ratio, n_jobs)
+            self.init_SMOTETomek(sampling_strategy)
         else:
             raise ae.WORCKeyError(f'{method} is not a valid sampling method!')
 
@@ -85,24 +84,23 @@ def init_RandomUnderSampling(self, sampling_strategy):
 
     def init_NearMiss(self, sampling_strategy, n_jobs):
         """Creata a near miss sampler object."""
         self.object = under_sampling.NearMiss(sampling_strategy=sampling_strategy,
-                                              random_state=self.random_state,
                                               n_jobs=n_jobs)
 
         self.sampling_strategy = sampling_strategy
         self.n_jobs = n_jobs
-    def init_NeigbourhoodCleaningRule(self, sampling_strategy, n_neighbors,
-                                      n_jobs, threshold_cleaning):
-        """Creata a NeigbourhoodCleaningRule sampler object."""
+    def init_NeighbourhoodCleaningRule(self, sampling_strategy, n_neighbors,
+                                       n_jobs, threshold_cleaning):
+        """Create a NeighbourhoodCleaningRule sampler object."""
         self.object =\
-            under_sampling.NeigbourhoodCleaningRule(sampling_strategy=sampling_strategy,
-                                                    random_state=self.random_state,
-                                                    threshold_cleaning=threshold_cleaning,
-                                                    n_jobs=n_jobs)
+            under_sampling.NeighbourhoodCleaningRule(sampling_strategy=sampling_strategy,
+                                                     threshold_cleaning=threshold_cleaning,
+                                                     n_jobs=n_jobs)
 
         self.sampling_strategy = sampling_strategy
-        self.threshold_cleaning = threshold_cleaning
+        self.n_neighbors = n_neighbors
         self.n_jobs = n_jobs
+        self.threshold_cleaning = threshold_cleaning
 
     def init_RandomOverSampling(self, sampling_strategy):
         """Creata a random over sampler object."""
         self.object = over_sampling.RandomOverSampler(sampling_strategy=sampling_strategy,
@@ -110,76 +108,78 @@ def init_RandomOverSampling(self, sampling_strategy):
                                                       random_state=self.random_state)
         self.sampling_strategy = sampling_strategy
 
-    def init_ADASYN(self, sampling_strategy, ratio, n_neighbors, n_jobs):
+    def init_ADASYN(self, sampling_strategy, n_neighbors, n_jobs):
         """Creata a ADASYN sampler object."""
         self.object = over_sampling.ADASYN(sampling_strategy=sampling_strategy,
                                            random_state=self.random_state,
-                                           ratio=ratio,
                                            n_neighbors=n_neighbors,
                                            n_jobs=n_jobs)
 
         self.sampling_strategy = sampling_strategy
-        self.ratio = ratio
         self.n_neighbors = n_neighbors
         self.n_jobs = n_jobs
 
-    def init_BorderlineSMOTE(self, ratio, k_neighbors, kind, n_jobs):
+    def init_BorderlineSMOTE(self, k_neighbors, n_jobs):
         """Creata a BorderlineSMOTE sampler object."""
         self.object =\
             over_sampling.BorderlineSMOTE(random_state=self.random_state,
-                                          ratio=ratio,
                                           k_neighbors=k_neighbors,
-                                          kind=kind,
                                           n_jobs=n_jobs)
 
-        self.ratio = ratio
         self.k_neighbors = k_neighbors
-        self.kind = kind
         self.n_jobs = n_jobs
 
-    def init_SMOTE(self, ratio, k_neighbors, kind, n_jobs):
+    def init_SMOTE(self, k_neighbors, n_jobs):
         """Creata a SMOTE sampler object."""
-        sm = SMOTE(random_state=self.random_state,
-                   ratio=ratio,
-                   k_neighbors=k_neighbors,
-                   kind=kind,
-                   n_jobs=n_jobs)
-
-        self.object = sm
+        self.object =\
+            over_sampling.SMOTE(random_state=self.random_state,
+                                k_neighbors=k_neighbors,
+                                n_jobs=n_jobs)
 
-        self.ratio = ratio
         self.k_neighbors = k_neighbors
-        self.kind = kind
         self.n_jobs = n_jobs
 
-    def init_SMOTEEN(self, sampling_strategy, ratio, n_jobs):
+    def init_SMOTEENN(self, sampling_strategy):
         """Creata a SMOTEEN sampler object."""
         self.object =\
             combine.SMOTEENN(random_state=self.random_state,
-                             sampling_strategy=sampling_strategy,
-                             ratio=ratio,
-                             n_jobs=n_jobs)
+                             sampling_strategy=sampling_strategy)
 
-        self.ratio = ratio
         self.sampling_strategy = sampling_strategy
-        self.n_jobs = n_jobs
 
-    def init_SMOTETomek(self, sampling_strategy, ratio, n_jobs):
+    def init_SMOTETomek(self, sampling_strategy):
         """Creata a SMOTE Tomek sampler object."""
         self.object =\
             combine.SMOTETomek(random_state=self.random_state,
-                               sampling_strategy=sampling_strategy,
-                               ratio=ratio,
-                               n_jobs=n_jobs)
+                               sampling_strategy=sampling_strategy)
 
-        self.ratio = ratio
         self.sampling_strategy = sampling_strategy
-        self.n_jobs = n_jobs
 
-    def fit(self, **kwargs):
+    def fit(self, *args, **kwargs):
         """Fit a sampler object."""
-        self.object.fit(**kwargs)
+        if hasattr(self.object, 'fit_resample'):
+            if self.verbose:
+                print('[WORC WARNING] Sampler has a fit_resample method: not fitting now.')
+        else:
+            # Object has a fit-transform interface
+            self.object.fit(*args, **kwargs)
 
-    def transform(self, **kwargs):
+    def transform(self, *args, **kwargs):
         """Transform objects with a fitted sampler."""
-        return self.object.transform(**kwargs)
+        if hasattr(self.object, 'fit_resample'):
+            if self.verbose:
+                print('[WORC WARNING] Sampler has a fit_resample method: fitting and resampling.')
+            try:
+                return self.object.fit_resample(*args, **kwargs)
+            except ValueError as message:
+                message = str(message)
+                message = 'The ObjectSampler could not ' +\
+                    'resample the objects with ' +\
+                    'the given parameters. ' +\
+                    'Probably your number of samples ' +\
+                    'is too small for the parameters ' +\
+                    'used. Original error: ' + message
+                raise ae.WORCValueError(message)
+
+        else:
+            return self.object.transform(*args, **kwargs)
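The fit/transform dispatch above exists because modern imbalanced-learn samplers no longer offer separate fit and transform steps: they expose a single fit_resample. A small standalone example using the imbalanced-learn 0.7 API:

```python
import numpy as np
from imblearn.over_sampling import SMOTE

X = np.random.rand(20, 3)
y = np.array([0] * 15 + [1] * 5)

# k_neighbors must stay below the minority class size, which is why
# ObjectSampler wraps resampling failures in a readable WORCValueError.
sampler = SMOTE(k_neighbors=3, random_state=42)
X_res, y_res = sampler.fit_resample(X, y)
print(len(y), '->', len(y_res))   # 20 -> 30: classes balanced
```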
diff --git a/WORC/classification/SearchCV.py b/WORC/classification/SearchCV.py
index 80a50697..2fb344b5 100644
--- a/WORC/classification/SearchCV.py
+++ b/WORC/classification/SearchCV.py
@@ -24,7 +24,7 @@
 from WORC.classification.metrics import check_scoring
 from sklearn.model_selection._split import check_cv
 from scipy.stats import rankdata
-from sklearn.externals import six
+import six
 from sklearn.utils.fixes import MaskedArray
 from sklearn.model_selection._search import ParameterSampler
 
@@ -45,6 +45,11 @@
 from joblib import Parallel, delayed
 from WORC.classification.fitandscore import fit_and_score, replacenan
 from WORC.classification.fitandscore import delete_nonestimator_parameters
+from sklearn.utils.validation import _check_fit_params
+from sklearn.utils.validation import _num_samples
+from sklearn.model_selection._validation import _aggregate_score_dicts
+from WORC.classification.metrics import check_multimetric_scoring
+from sklearn.metrics._scorer import _MultimetricScorer
 import WORC.addexceptions as WORCexceptions
 import pandas as pd
 import json
@@ -57,6 +62,7 @@
 from WORC.classification.estimators import RankedSVM
 from WORC.classification import construct_classifier as cc
 from WORC.featureprocessing.Preprocessor import Preprocessor
+from WORC.detectors.detectors import DebugDetector
 
 
 def rms_score(truth, prediction):
@@ -566,10 +572,19 @@ def inverse_transform(self, Xt):
         return self.best_estimator_.transform(Xt)
 
     def preprocess(self, X, y=None, training=False):
-        '''Apply the available preprocssing methods to the features'''
+        """Apply the available preprocessing methods to the features."""
         if self.best_preprocessor is not None:
             X = self.best_preprocessor.transform(X)
 
+        if self.best_imputer is not None:
+            X = self.best_imputer.transform(X)
+
+        # Replace nan if still left
+        X = replacenan(np.asarray(X)).tolist()
+
+        if self.best_groupsel is not None:
+            X = self.best_groupsel.transform(X)
+
         if not training and hasattr(self, 'overfit_scaler') and self.overfit_scaler:
             # Overfit the feature scaling on the test set
             # NOTE: Never use this in an actual model, only to assess how
@@ -586,15 +601,6 @@ def preprocess(self, X, y=None, training=False):
         if self.best_scaler is not None:
             X = self.best_scaler.transform(X)
 
-        if self.best_imputer is not None:
-            X = self.best_imputer.transform(X)
-
-        # Replace nan if still left
-        X = replacenan(np.asarray(X)).tolist()
-
-        if self.best_groupsel is not None:
-            X = self.best_groupsel.transform(X)
-
         if self.best_varsel is not None:
             X = self.best_varsel.transform(X)
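Moving imputation (and the NaN fallback) ahead of group selection matters because the selection and scaling steps cannot cope with NaNs; after this change every later transformer sees complete data. The ordering constraint in miniature, in plain scikit-learn:

```python
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

X = np.array([[1.0, np.nan], [2.0, 0.5], [3.0, 0.4]])

# Impute first, then select: the selector never sees NaNs
# (VarianceThreshold raises on NaN input otherwise).
X_imp = SimpleImputer(strategy='mean').fit_transform(X)
X_sel = VarianceThreshold().fit_transform(X_imp)
print(X_sel.shape)
```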
@@ -610,40 +616,34 @@ def preprocess(self, X, y=None, training=False):
         if self.best_statisticalsel is not None:
             X = self.best_statisticalsel.transform(X)
 
-        # Only oversample in training phase, i.e. if we have the labels
+        # Only resample in the training phase, i.e. if we have the labels
         if y is not None:
-            if self.best_SMOTE is not None:
-                X, y = self.best_SMOTE.fit_sample(X, y)
-
-            if self.best_RandomOverSampler is not None:
-                X, y = self.best_RandomOverSampler.fit_sample(X, y)
+            if self.best_Sampler is not None:
+                X, y = self.best_Sampler.transform(X, y)
 
         return X, y
 
-    @property
-    def best_params_(self):
-        check_is_fitted(self, 'cv_results_')
-        return self.cv_results_['params_all'][self.best_index_]
-
-    @property
-    def best_score_(self):
-        check_is_fitted(self, 'cv_results_')
-        return self.cv_results_['mean_test_score'][self.best_index_]
-
-    def process_fit(self, n_splits, parameters_est, parameters_all,
-                    test_sample_counts, test_scores,
-                    train_scores, fit_time, score_time, cv_iter,
+    def process_fit(self, n_splits, parameters_all,
+                    test_sample_counts, test_score_dicts,
+                    train_score_dicts, fit_time, score_time, cv_iter,
                     X, y):
         """
         Process the outcomes of a SearchCV fit and find the best settings
         over all cross validations from all hyperparameters tested
 
+        Very similar to the _format_results function of the original SearchCV.
+
         """
+        # test_score_dicts and train_score_dicts are lists of dictionaries and
+        # we make them into a dict of lists
+        test_scores = _aggregate_score_dicts(test_score_dicts)
+        if self.return_train_score:
+            train_scores = _aggregate_score_dicts(train_score_dicts)
+
         # We take only one result per split, default by sklearn
-        candidate_params_est = list(parameters_est[::n_splits])
         candidate_params_all = list(parameters_all[::n_splits])
-        n_candidates = len(candidate_params_est)
+        n_candidates = len(candidate_params_all)
 
         # Computed the (weighted) mean and std for test scores alone
         # NOTE test_sample counts (weights) remain the same for all candidates
@@ -692,13 +692,37 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
             results["rank_%s" % key_name] = np.asarray(
                 rankdata(-array_means, method='min'), dtype=np.int32)
 
-        _store('test_score', test_scores, splits=True, rank=True,
-               weights=test_sample_counts if self.iid else None)
-        if self.return_train_score:
-            _store('train_score', train_scores, splits=True)
         _store('fit_time', fit_time)
         _store('score_time', score_time)
 
+        # Store scores
+        # Check whether to do multimetric scoring
+        test_estimator = cc.construct_classifier(candidate_params_all[0])
+        scorers, self.multimetric_ = check_multimetric_scoring(
+            test_estimator, scoring=self.scoring)
+
+        # NOTE test_sample counts (weights) remain the same for all candidates
+        test_sample_counts = np.array(test_sample_counts[:n_splits],
+                                      dtype=np.int)
+
+        if self.iid != 'deprecated':
+            warnings.warn(
+                "The parameter 'iid' is deprecated in 0.22 and will be "
+                "removed in 0.24.", FutureWarning
+            )
+            iid = self.iid
+        else:
+            iid = False
+
+        for scorer_name in scorers.keys():
+            # Computed the (weighted) mean and std for test scores alone
+            _store('test_%s' % scorer_name, test_scores[scorer_name],
+                   splits=True, rank=True,
+                   weights=test_sample_counts if iid else None)
+            if self.return_train_score:
+                _store('train_%s' % scorer_name, train_scores[scorer_name],
+                       splits=True)
+
         # Compute the "Generalization" score
         difference_score = abs(results['mean_train_score'] - results['mean_test_score'])
         generalization_score = results['mean_test_score'] - difference_score
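process_fit now receives one score dict per fold and scorer instead of bare score arrays; _aggregate_score_dicts (a scikit-learn helper) turns that list of dicts into a dict of arrays so _store can index per scorer name. Its effect, sketched in plain Python:

```python
import numpy as np

fold_scores = [{'f1_weighted': 0.80, 'accuracy': 0.75},
               {'f1_weighted': 0.70, 'accuracy': 0.72}]

aggregated = {key: np.asarray([fold[key] for fold in fold_scores])
              for key in fold_scores[0]}
print(aggregated['f1_weighted'])   # [0.8 0.7]
```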
@@ -706,6 +730,45 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
         results['rank_generalization_score'] = np.asarray(
             rankdata(-results['generalization_score'], method='min'), dtype=np.int32)
 
+        if self.multimetric_:
+            if self.refit is not False and (
+                    not isinstance(self.refit, str) or
+                    # This will work for both dict / list (tuple)
+                    self.refit not in scorers) and not callable(self.refit):
+                raise ValueError("For multi-metric scoring, the parameter "
+                                 "refit must be set to a scorer key or a "
+                                 "callable to refit an estimator with the "
+                                 "best parameter setting on the whole "
+                                 "data and make the best_* attributes "
+                                 "available for that metric. If this is "
+                                 "not needed, refit should be set to "
+                                 "False explicitly. %r was passed."
+                                 % self.refit)
+            else:
+                refit_metric = self.refit
+        else:
+            refit_metric = 'score'
+
+        # For multi-metric evaluation, store the best_index_, best_params_ and
+        # best_score_ iff refit is one of the scorer names
+        # In single metric evaluation, refit_metric is "score"
+        if self.refit or not self.multimetric_:
+            # If callable, refit is expected to return the index of the best
+            # parameter set.
+            if callable(self.refit):
+                self.best_index_ = self.refit(results)
+                if not isinstance(self.best_index_, numbers.Integral):
+                    raise TypeError('best_index_ returned is not an integer')
+                if (self.best_index_ < 0 or
+                        self.best_index_ >= len(results["params"])):
+                    raise IndexError('best_index_ index out of range')
+            else:
+                self.best_index_ = results["rank_test_%s"
+                                           % refit_metric].argmin()
+                self.best_score_ = results["mean_test_%s" % refit_metric][
+                    self.best_index_]
+            self.best_params_ = candidate_params_all[self.best_index_]
+
         # Rank the indices of scores from all parameter settings
         ranked_test_scores = results["rank_" + self.ranking_score]
         indices = range(0, len(ranked_test_scores))
@@ -716,48 +779,35 @@ def _store(key_name, array, weights=None, splits=False, rank=False):
         maxlen = min(self.maxlen, n_candidates)
         bestindices = sortedindices[0:maxlen]
 
-        candidate_params_est = np.asarray(candidate_params_est)[bestindices].tolist()
         candidate_params_all = np.asarray(candidate_params_all)[bestindices].tolist()
         for k in results.keys():
             results[k] = results[k][bestindices]
 
-        n_candidates = len(candidate_params_est)
+        n_candidates = len(candidate_params_all)
+        results['params'] = candidate_params_all
 
         # Store the atributes of the best performing estimator
         best_index = np.flatnonzero(results["rank_" + self.ranking_score] == 1)[0]
-        best_parameters_est = candidate_params_est[best_index]
         best_parameters_all = candidate_params_all[best_index]
 
-        # Use one MaskedArray and mask all the places where the param is not
-        # applicable for that candidate. Use defaultdict as each candidate may
-        # not contain all the params
-        param_results = defaultdict(partial(MaskedArray,
-                                            np.empty(n_candidates,),
-                                            mask=True,
-                                            dtype=object))
-        for cand_i, params in enumerate(candidate_params_all):
-            for name, value in params.items():
-                # An all masked empty array gets created for the key
-                # `"param_%s" % name` at the first occurence of `name`.
- # Setting the value at an index also unmasks that index - param_results["param_%s" % name][cand_i] = value - - # Store a list of param dicts at the key 'params' - results['params'] = candidate_params_est - results['params_all'] = candidate_params_all - + # Store several objects self.cv_results_ = results - self.best_index_ = best_index self.n_splits_ = n_splits self.cv_iter = cv_iter + self.best_index_ = best_index + self.best_params_ = results["params"][self.best_index_] - # Refit all objects with best settings on the full dataset - indices = range(0, len(y)) - self.refit_and_score(X, y, best_parameters_all, best_parameters_est, - train=indices, test=indices) + if self.refit: + # We always refit on the full dataset + indices = np.arange(0, len(y)) + self.refit_and_score(X, y, best_parameters_all, + train=indices, test=indices) + + # Store the only scorer not as a dict for single metric evaluation + self.scorer_ = scorers if self.multimetric_ else scorers['score'] return self - def refit_and_score(self, X, y, parameters_all, parameters_est, + def refit_and_score(self, X, y, parameters_all, train, test, verbose=None): """Refit the base estimator and attributes such as GroupSel @@ -775,9 +825,6 @@ def refit_and_score(self, X, y, parameters_all, parameters_est, and the fitting. TODO: Create a default object and show the fields. - parameters_est: dictionary, mandatory - Contains the settings used for the base estimator - train: list, mandatory Indices of the objects to be used as training set. @@ -809,19 +856,21 @@ def refit_and_score(self, X, y, parameters_all, parameters_est, preprocessor = None # Refit all preprocessing functions + fit_params = _check_fit_params(X, self.fit_params) out = fit_and_score(X_fit, y, self.scoring, train, test, parameters_all, - fit_params=self.fit_params, + fit_params=fit_params, return_train_score=self.return_train_score, return_n_test_samples=True, - return_times=True, return_parameters=True, + return_times=True, return_parameters=False, + return_estimator=False, error_score=self.error_score, verbose=verbose, return_all=True) # Associate best options with new fits (save_data, GroupSel, VarSel, SelectModel, feature_labels, scalers,\ - Imputers, PCAs, StatisticalSel, ReliefSel, sm, ros) = out + Imputers, PCAs, StatisticalSel, ReliefSel, Sampler) = out self.best_groupsel = GroupSel self.best_scaler = scalers self.best_varsel = VarSel @@ -832,14 +881,12 @@ def refit_and_score(self, X, y, parameters_all, parameters_est, self.best_featlab = feature_labels self.best_statisticalsel = StatisticalSel self.best_reliefsel = ReliefSel - self.best_SMOTE = sm - self.best_RandomOverSampler = ros + self.best_Sampler = Sampler # Fit the estimator using the preprocessed features X = [x[0] for x in X] X, y = self.preprocess(X, y, training=True) - parameters_est = delete_nonestimator_parameters(parameters_est) best_estimator = cc.construct_classifier(parameters_all) # NOTE: This just has to go to the construct classifier function, @@ -917,9 +964,8 @@ def compute_performance(scoring, Y_valid_truth, Y_valid_score): scoring = self.scoring # Get settings for best 100 estimators - parameters_est = self.cv_results_['params'] - parameters_all = self.cv_results_['params_all'] - n_classifiers = len(parameters_est) + parameters_all = self.cv_results_['params'] + n_classifiers = len(parameters_all) n_iter = len(self.cv_iter) # Create a new base object for the ensemble components @@ -956,16 +1002,16 @@ def compute_performance(scoring, Y_valid_truth, Y_valid_score): Y_valid_score_it = 
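For context, the best_* attributes stored above are what preprocess() re-applies at predict time; the sampler is deliberately used only when labels are present, i.e. during training. A sketch of that idea (attribute list abbreviated and partly assumed; transform signatures as used in the diff):

def apply_preprocessing(searchcv, X, y=None):
    # Re-apply the preprocessing objects fitted in refit_and_score,
    # in the same order as during the hyperparameter search.
    for step in (searchcv.best_imputer, searchcv.best_groupsel,
                 searchcv.best_scaler, searchcv.best_varsel,
                 searchcv.best_reliefsel, searchcv.best_statisticalsel):
        if step is not None:
            X = step.transform(X)

    # Resampling only makes sense when labels are available, i.e. in
    # training; at test time the samples pass through unchanged.
    if y is not None and searchcv.best_Sampler is not None:
        X, y = searchcv.best_Sampler.transform(X, y)
    return X, y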
np.zeros((n_classifiers, len(valid))) # Loop over the 100 best estimators - for num, (p_est, p_all) in enumerate(zip(parameters_est, parameters_all)): + for num, p_all in enumerate(parameters_all): # NOTE: Explicitly exclude validation set, elso refit and score # somehow still seems to use it. X_train_temp = [X_train[i] for i in train] Y_train_temp = [Y_train[i] for i in train] - train_temp = range(0, len(train)) + train_temp = np.arange(0, len(train)) # Refit a SearchCV object with the provided parameters base_estimator.refit_and_score(X_train_temp, Y_train_temp, p_all, - p_est, train_temp, train_temp, + train_temp, train_temp, verbose=False) # Predict and save scores @@ -1070,11 +1116,9 @@ def compute_performance(scoring, Y_valid_truth, Y_valid_score): print('Creating ensemble with Caruana method.') # BUG: kernel parameter is sometimes saved in unicode - for i in range(0, len(parameters_est)): - kernel = str(parameters_est[i][u'kernel']) - del parameters_est[i][u'kernel'] + for i in range(0, len(parameters_all)): + kernel = str(parameters_all[i][u'kernel']) del parameters_all[i][u'kernel'] - parameters_est[i]['kernel'] = kernel parameters_all[i]['kernel'] = kernel # In order to speed up the process, we precompute all scores of the possible @@ -1092,16 +1136,16 @@ def compute_performance(scoring, Y_valid_truth, Y_valid_score): Y_valid_score_it = np.zeros((n_classifiers, len(valid))) # Loop over the 100 best estimators - for num, (p_est, p_all) in enumerate(zip(parameters_est, parameters_all)): + for num, p_all in enumerate(parameters_all): # NOTE: Explicitly exclude validation set, elso refit and score # somehow still seems to use it. X_train_temp = [X_train[i] for i in train] Y_train_temp = [Y_train[i] for i in train] - train_temp = range(0, len(train)) + train_temp = np.arange(0, len(train)) # Refit a SearchCV object with the provided parameters base_estimator.refit_and_score(X_train_temp, Y_train_temp, p_all, - p_est, train_temp, train_temp, + train_temp, train_temp, verbose=False) # Predict and save scores @@ -1267,12 +1311,11 @@ def compute_performance(scoring, Y_valid_truth, Y_valid_score): # Create the ensemble -------------------------------------------------- # Create the ensemble trained on the full training set - parameters_est = [parameters_est[i] for i in ensemble] parameters_all = [parameters_all[i] for i in ensemble] estimators = list() - train = range(0, len(X_train)) + train = np.arange(0, len(X_train)) nest = len(ensemble) - for enum, (p_est, p_all) in enumerate(zip(parameters_est, parameters_all)): + for enum, p_all in enumerate(parameters_all): # Refit a SearchCV object with the provided parameters print(f"Refitting estimator {enum+1} / {nest}.") base_estimator = clone(base_estimator) @@ -1283,7 +1326,7 @@ def compute_performance(scoring, Y_valid_truth, Y_valid_score): # base_estimator = OneVsRestClassifier(base_estimator) base_estimator.refit_and_score(X_train, Y_train, p_all, - p_est, train, train, + train, train, verbose=False) # Determine whether to overfit the feature scaling on the test set @@ -1306,6 +1349,7 @@ def _fit(self, X, y, groups, parameter_iterable): isclassifier =\ not any(clf in regressors for clf in self.param_distributions['classifiers']) + # Check the cross-validation object and do the splitting cv = check_cv(self.cv, y, classifier=isclassifier) X, y, groups = indexable(X, y, groups) @@ -1315,7 +1359,27 @@ def _fit(self, X, y, groups, parameter_iterable): print(f"Fitting {n_splits} folds for each of {n_candidates} candidates, totalling 
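The "Caruana method" used for the ensembles here is greedy forward selection with replacement over the precomputed validation-score matrix. A compact sketch of the selection loop (matrix layout as above; the scoring callable is assumed, e.g. sklearn's roc_auc_score):

import numpy as np

def greedy_ensemble(Y_valid_score, y_valid, n_members, score_fn):
    # Y_valid_score: (n_classifiers, n_valid) precomputed predictions.
    ensemble = []
    summed = np.zeros(Y_valid_score.shape[1])
    for _ in range(n_members):
        best_perf, best_idx = -np.inf, None
        for idx in range(Y_valid_score.shape[0]):
            # Ensemble prediction if this classifier were added.
            candidate = (summed + Y_valid_score[idx]) / (len(ensemble) + 1)
            perf = score_fn(y_valid, candidate)
            if perf > best_perf:
                best_perf, best_idx = perf, idx
        ensemble.append(best_idx)         # with replacement: may repeat
        summed += Y_valid_score[best_idx]
    return ensemble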
{n_candidates * n_splits} fits.") cv_iter = list(cv.split(X, y, groups)) - name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) + + # NOTE: We do not check the scoring here, as this can differ + # per estimator. Thus, this is done inside the fit and scoring + + # Check fitting parameters + fit_params = _check_fit_params(X, self.fit_params) + + # Create temporary directory for fastr + if DebugDetector().do_detection(): + # Specific name for easy debugging + debugnum = 0 + name = 'DEBUG_' + str(debugnum) + tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name) + while os.path.exists(tempfolder): + debugnum += 1 + name = 'DEBUG_' + str(debugnum) + tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name) + + else: + name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10)) + tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name) if not os.path.exists(tempfolder): os.makedirs(tempfolder) @@ -1408,12 +1472,20 @@ def _fit(self, X, y, groups, parameter_iterable): 'verbose', 'fit_params', 'return_train_score', 'return_n_test_samples', 'return_times', 'return_parameters', + 'return_estimator', 'error_score'] + verbose = False + return_n_test_samples = True + return_times = True + return_parameters = False + return_estimator = False estimator_data = pd.Series([X, y, self.scoring, - False, - self.fit_params, self.return_train_score, - True, True, True, + verbose, fit_params, + self.return_train_score, + return_n_test_samples, return_times, + return_parameters, + return_estimator, self.error_score], index=estimator_labels, name='estimator Data') @@ -1473,23 +1545,24 @@ def _fit(self, X, y, groups, parameter_iterable): # if one choose to see train score, "out" will contain train score info if self.return_train_score: (train_scores, test_scores, test_sample_counts, - fit_time, score_time, parameters_est, parameters_all) =\ + fit_time, score_time, parameters_all) =\ zip(*save_data) else: (test_scores, test_sample_counts, - fit_time, score_time, parameters_est, parameters_all) =\ + fit_time, score_time, parameters_all) =\ zip(*save_data) # Remove the temporary folder used - shutil.rmtree(tempfolder) + if name != 'DEBUG_0': + # Do delete if not debugging for first iteration + shutil.rmtree(tempfolder) # Process the results of the fitting procedure self.process_fit(n_splits=n_splits, - parameters_est=parameters_est, parameters_all=parameters_all, test_sample_counts=test_sample_counts, - test_scores=test_scores, - train_scores=train_scores, + test_score_dicts=test_scores, + train_score_dicts=train_scores, fit_time=fit_time, score_time=score_time, cv_iter=cv_iter, @@ -1753,6 +1826,7 @@ def _fit(self, X, y, groups, parameter_iterable): isclassifier =\ not any(clf in regressors for clf in self.param_distributions['classifiers']) + # Check the cross-validation object and do the splitting cv = check_cv(self.cv, y, classifier=isclassifier) X, y, groups = indexable(X, y, groups) @@ -1766,6 +1840,9 @@ def _fit(self, X, y, groups, parameter_iterable): pre_dispatch = self.pre_dispatch cv_iter = list(cv.split(X, y, groups)) + # Check fitting parameters + fit_params = _check_fit_params(X, self.fit_params) + # Draw parameter sample for num, parameters in enumerate(parameter_iterable): parameter_sample = parameters @@ -1788,10 +1865,11 @@ def _fit(self, X, y, groups, parameter_iterable): pre_dispatch=pre_dispatch )(delayed(fit_and_score)(X, y, self.scoring, train, test, parameters, - fit_params=self.fit_params, + 
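Distilled from the temporary-folder handling above: in debug mode the folder gets a predictable DEBUG_<n> name (and DEBUG_0 is kept after the run, per the rmtree guard), otherwise a random 10-character name is drawn. A sketch of just that choice:

import os
import random
import string

def pick_tempfolder(base, debugging):
    if debugging:
        # Deterministic name for easy debugging; bump the suffix until
        # an unused folder is found.
        n = 0
        while os.path.exists(os.path.join(base, f'DEBUG_{n}')):
            n += 1
        return os.path.join(base, f'DEBUG_{n}')
    name = ''.join(random.choice(string.ascii_uppercase + string.digits)
                   for _ in range(10))
    return os.path.join(base, name)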
fit_params=fit_params, return_train_score=self.return_train_score, return_n_test_samples=True, - return_times=True, return_parameters=True, + return_times=True, return_parameters=False, + return_estimator=False, error_score=self.error_score, verbose=False, return_all=False) @@ -1802,19 +1880,18 @@ def _fit(self, X, y, groups, parameter_iterable): # if one choose to see train score, "out" will contain train score info if self.return_train_score: (train_scores, test_scores, test_sample_counts, - fit_time, score_time, parameters_est, parameters_all) =\ + fit_time, score_time, parameters_all) =\ save_data else: (test_scores, test_sample_counts, - fit_time, score_time, parameters_est, parameters_all) =\ + fit_time, score_time, parameters_all) =\ save_data self.process_fit(n_splits=n_splits, - parameters_est=parameters_est, parameters_all=parameters_all, test_sample_counts=test_sample_counts, - test_scores=test_scores, - train_scores=train_scores, + test_score_dicts=test_scores, + train_score_dicts=train_scores, fit_time=fit_time, score_time=score_time, cv_iter=cv_iter, diff --git a/WORC/classification/construct_classifier.py b/WORC/classification/construct_classifier.py index c9a78638..d730b258 100644 --- a/WORC/classification/construct_classifier.py +++ b/WORC/classification/construct_classifier.py @@ -106,6 +106,7 @@ def construct_classifier(config): # Logistic Regression classifier = LogisticRegression(max_iter=max_iter, penalty=config['LRpenalty'], + solver='lbfgs', C=config['LRC'], random_state=config['random_seed']) elif config['classifiers'] == 'GaussianNB': diff --git a/WORC/classification/crossval.py b/WORC/classification/crossval.py index 8d5e2c38..11e2a506 100644 --- a/WORC/classification/crossval.py +++ b/WORC/classification/crossval.py @@ -361,7 +361,7 @@ def crossval(config, label_data, image_features, n += 1 filename = os.path.join(tempfolder, 'tempsave_' + str(i + n) + '.hdf5') - panda_data.to_hdf(filename, 'SVMdata') + panda_data.to_hdf(filename, 'EstimatorData') del panda_data, panda_data_temp # Print elapsed time diff --git a/WORC/classification/fitandscore.py b/WORC/classification/fitandscore.py index ad1cdf70..95d1329d 100644 --- a/WORC/classification/fitandscore.py +++ b/WORC/classification/fitandscore.py @@ -24,18 +24,19 @@ from sklearn.decomposition import PCA from sklearn.multiclass import OneVsRestClassifier from sklearn.ensemble import RandomForestClassifier -from imblearn.over_sampling import SMOTE, RandomOverSampler -from sklearn.utils import check_random_state -import random +from WORC.classification.ObjectSampler import ObjectSampler +from sklearn.utils.metaestimators import _safe_split +from sklearn.utils.validation import _num_samples from sklearn.metrics import make_scorer, average_precision_score from WORC.classification.estimators import RankedSVM from WORC.classification import construct_classifier as cc -from WORC.classification.metrics import check_scoring +from WORC.classification.metrics import check_multimetric_scoring from WORC.featureprocessing.Relief import SelectMulticlassRelief from WORC.featureprocessing.Imputer import Imputer from WORC.featureprocessing.VarianceThreshold import selfeat_variance from WORC.featureprocessing.StatisticalTestThreshold import StatisticalTestThreshold from WORC.featureprocessing.SelectGroups import SelectGroups +import WORC.addexceptions as ae # Specific imports for error management from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA @@ -43,26 +44,28 @@ def fit_and_score(X, y, scoring, - train, 
test, para, + train, test, parameters, fit_params=None, return_train_score=True, return_n_test_samples=True, - return_times=True, return_parameters=True, + return_times=True, return_parameters=False, + return_estimator=False, error_score='raise', verbose=True, return_all=True): - ''' - Fit an estimator to a dataset and score the performance. The following + """Fit an estimator to a dataset and score the performance. + + The following methods can currently be applied as preprocessing before fitting, in this order: 1. Apply feature imputation 2. Select features based on feature type group (e.g. shape, histogram). - 3. Oversampling + 3. Scale features with e.g. z-scoring. 4. Apply feature selection based on variance of feature among patients. 5. Univariate statistical testing (e.g. t-test, Wilcoxon). - 6. Scale features with e.g. z-scoring. - 7. Use Relief feature selection. - 8. Select features based on a fit with a LASSO model. - 9. Select features using PCA. + 6. Use Relief feature selection. + 7. Select features based on a fit with a LASSO model. + 8. Select features using PCA. + 9. Resampling 10. If a SingleLabel classifier is used for a MultiLabel problem, a OneVsRestClassifier is employed around it. @@ -89,7 +92,7 @@ def fit_and_score(X, y, scoring, test: list, mandatory Indices of the objects to be used as testing set. - para: dictionary, mandatory + parameters: dictionary, mandatory Contains the settings used for the above preprocessing functions and the fitting. TODO: Create a default object and show the fields. @@ -112,6 +115,9 @@ def fit_and_score(X, y, scoring, Return the parameters used in the final fit to the final SearchCV object. + return_estimator : bool, default=False + Whether to return the fitted estimator. + error_score: numeric or "raise" by default Value to assign to the score if an error occurs in estimator fitting. If set to "raise", the error is raised. If a numeric @@ -173,22 +179,43 @@ def fit_and_score(X, y, scoring, Either None if the RELIEF feature selection is not used, or the fitted object. - Snote: WORC SMOTE Object - Either None if the SMOTE oversampling is not used, or - the fitted object. + Sampler: WORC ObjectSampler Object + Either None if no resampling is used, or an ObjectSampler object - RandOverSample: WORC RandomOverSampler Object - Either None if Random Oversampling is not used, or - the fitted object. 
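The reordered list in this docstring is the heart of the change: scaling moved up to step 3, and resampling moved from step 3 to step 9, after all feature processing. For reference, as a plain sequence (step names paraphrased from the docstring):

# Order of operations in fit_and_score after this change:
FIT_AND_SCORE_ORDER = [
    'imputation',           # 1. fill in missing feature values
    'group_selection',      # 2. select feature type groups (shape, ...)
    'scaling',              # 3. z-score / robust / minmax
    'variance_threshold',   # 4. drop near-constant features
    'statistical_testing',  # 5. univariate tests (t-test, Wilcoxon)
    'relief',               # 6. RELIEF feature selection
    'select_from_model',    # 7. LASSO-based selection
    'pca',                  # 8. dimensionality reduction
    'resampling',           # 9. ObjectSampler, training samples only
    'one_vs_rest',          # 10. wrap single-label clf for multilabel
]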
- ''' + """ + # We copy the parameter object so we can alter it and keep the original + if verbose: + print("\n") + print('#######################################') + print('Starting fit and score of new workflow.') + para_estimator = parameters.copy() + estimator = cc.construct_classifier(para_estimator) + + # Check the scorer + scorers, __ = check_multimetric_scoring(estimator, scoring=scoring) + + para_estimator = delete_cc_para(para_estimator) + + # Get random seed from parameters + random_seed = para_estimator['random_seed'] + del para_estimator['random_seed'] + + # X is a tuple: split in two arrays + feature_values = np.asarray([x[0] for x in X]) + feature_labels = np.asarray([x[1] for x in X]) + + # Split in train and testing + X_train, y_train = _safe_split(estimator, feature_values, y, train) + X_test, y_test = _safe_split(estimator, feature_values, y, test, train) + train = np.arange(0, len(y_train)) + test = np.arange(len(y_train), len(y_train) + len(y_test)) + # Set some defaults for if a part fails and we return a dummy test_sample_counts = len(test) fit_time = np.inf score_time = np.inf - train_score = np.nan - test_score = np.nan - Smote = None + Sampler = None imputer = None scaler = None GroupSel = None @@ -197,37 +224,32 @@ def fit_and_score(X, y, scoring, StatisticalSel = None VarSel = None ReliefSel = None - RandOverSample = None - - if return_train_score: - ret = [train_score, test_score, test_sample_counts, - fit_time, score_time, para, para] + if isinstance(scorers, dict): + test_scores = {name: np.nan for name in scorers} + if return_train_score: + train_scores = test_scores.copy() else: - ret = [test_score, test_sample_counts, - fit_time, score_time, para, para] + test_scores = error_score + if return_train_score: + train_scores = error_score - # We copy the parameter object so we can alter it and keep the original - if verbose: - print("\n") - print('#######################################') - print('Starting fit and score of new workflow.') - para_estimator = para.copy() - estimator = cc.construct_classifier(para_estimator) - if scoring != 'average_precision_weighted': - scorer = check_scoring(estimator, scoring=scoring) - else: - scorer = make_scorer(average_precision_score, average='weighted') + # Initiate dummy return object for when fit and scoring failes: sklearn defaults + ret = [train_scores, test_scores] if return_train_score else [test_scores] - para_estimator = delete_cc_para(para_estimator) + # ret = [train_scores, test_scores, test_sample_counts, + # fit_time, score_time, para_estimator, para] - # Get random seed from parameters - random_seed = para_estimator['random_seed'] - random_state = check_random_state(random_seed) - del para_estimator['random_seed'] + if return_n_test_samples: + ret.append(_num_samples(X_test)) + if return_times: + ret.extend([fit_time, score_time]) + if return_parameters: + ret.append(para_estimator) + if return_estimator: + ret.append(estimator) - # X is a tuple: split in two arrays - feature_values = np.asarray([x[0] for x in X]) - feature_labels = np.asarray([x[1] for x in X]) + # Additional to sklearn defaults: return all parameters + ret.append(parameters) # ------------------------------------------------------------------------ # Feature imputation @@ -240,10 +262,10 @@ def fit_and_score(X, y, scoring, imputer = Imputer(missing_values=np.nan, strategy=imp_type, n_neighbors=imp_nn) - imputer.fit(feature_values) - feature_values = imputer.transform(feature_values) + imputer.fit(X_train) + X_train = 
imputer.transform(X_train) + X_test = imputer.transform(X_test) - if 'Imputation' in para_estimator.keys(): del para_estimator['Imputation'] del para_estimator['ImputationMethod'] del para_estimator['ImputationNeighbours'] @@ -253,28 +275,8 @@ def fit_and_score(X, y, scoring, del imputer # Remove any NaN feature values if these are still left after imputation - feature_values = replacenan(feature_values, verbose=verbose, feature_labels=feature_labels[0]) - - # ------------------------------------------------------------------------ - # Feature scaling - if 'FeatureScaling' in para_estimator.keys(): - if verbose: - print("Fitting scaler and transforming features.") - - if para_estimator['FeatureScaling'] == 'z_score': - scaler = StandardScaler().fit(feature_values) - elif para_estimator['FeatureScaling'] == 'robust': - scaler = RobustScaler().fit(feature_values) - elif para_estimator['FeatureScaling'] == 'minmax': - scaler = MinMaxScaler().fit(feature_values) - - if scaler is not None: - feature_values = scaler.transform(feature_values) - del para_estimator['FeatureScaling'] - - # Delete the object if we do not need to return it - if not return_all: - del scaler + X_train = replacenan(X_train, verbose=verbose, feature_labels=feature_labels[0]) + X_test = replacenan(X_test, verbose=verbose, feature_labels=feature_labels[0]) # ------------------------------------------------------------------------ # Groupwise feature selection @@ -327,22 +329,26 @@ def fit_and_score(X, y, scoring, parameters_featsel[group] = value + # Fit groupwise feature selection object GroupSel = SelectGroups(parameters=parameters_featsel, toolboxes=toolboxes) GroupSel.fit(feature_labels[0]) if verbose: - print("Original Length: " + str(len(feature_values[0]))) - feature_values = GroupSel.transform(feature_values) + print("\t Original Length: " + str(len(X_train[0]))) + + # Transform all objectd accordingly + X_train = GroupSel.transform(X_train) if verbose: - print("New Length: " + str(len(feature_values[0]))) + print("\t New Length: " + str(len(X_train[0]))) feature_labels = GroupSel.transform(feature_labels) + X_test = GroupSel.transform(X_test) # Delete the object if we do not need to return it if not return_all: del GroupSel # Check whether there are any features left - if len(feature_values[0]) == 0: + if len(X_train[0]) == 0: # TODO: Make a specific WORC exception for this warning. if verbose: print('[WARNING]: No features are selected! Probably all feature groups were set to False. 
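The pattern above (fit the imputer on X_train only, then transform X_train and X_test separately) is the validation-leakage fix this release revolves around: preprocessing statistics now come from the training split only. The same idiom with a plain scikit-learn scaler, for illustration:

import numpy as np
from sklearn.preprocessing import StandardScaler

X_train = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
X_test = np.array([[2.5, 25.0]])

scaler = StandardScaler().fit(X_train)  # statistics from train only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)       # no information from the test set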
Parameters:') @@ -352,19 +358,34 @@ def fit_and_score(X, y, scoring, para_estimator = delete_nonestimator_parameters(para_estimator) if return_all: - return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Smote, RandOverSample + return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Sampler else: return ret # ------------------------------------------------------------------------ - # FIXME: When only using LBP feature, X is 3 dimensional with 3rd dimension length 1 - if len(feature_values.shape) == 3: - feature_values = np.reshape(feature_values, (feature_values.shape[0], feature_values.shape[1])) - if len(feature_labels.shape) == 3: - feature_labels = np.reshape(feature_labels, (feature_labels.shape[0], feature_labels.shape[1])) + # Feature scaling + if 'FeatureScaling' in para_estimator.keys(): + if verbose: + print("Fitting scaler and transforming features.") - # Remove any NaN feature values if these are still left after imputation - feature_values = replacenan(feature_values, verbose=verbose, feature_labels=feature_labels[0]) + if para_estimator['FeatureScaling'] == 'z_score': + scaler = StandardScaler().fit(X_train) + elif para_estimator['FeatureScaling'] == 'robust': + scaler = RobustScaler().fit(X_train) + elif para_estimator['FeatureScaling'] == 'minmax': + scaler = MinMaxScaler().fit(X_train) + else: + scaler = None + + if scaler is not None: + X_train = scaler.transform(X_train) + X_test = scaler.transform(X_test) + + del para_estimator['FeatureScaling'] + + # Delete the object if we do not need to return it + if not return_all: + del scaler # -------------------------------------------------------------------- # Feature selection based on variance @@ -372,15 +393,16 @@ def fit_and_score(X, y, scoring, if verbose: print("Selecting features based on variance.") if verbose: - print("Original Length: " + str(len(feature_values[0]))) + print("\t Original Length: " + str(len(X_train[0]))) try: - feature_values, feature_labels, VarSel =\ - selfeat_variance(feature_values, feature_labels) + X_train, feature_labels, VarSel =\ + selfeat_variance(X_train, feature_labels) + X_test = VarSel.transform(X_test) except ValueError: if verbose: print('[WARNING]: No features meet the selected Variance threshold! Skipping selection.') if verbose: - print("New Length: " + str(len(feature_values[0]))) + print("\t New Length: " + str(len(X_train[0]))) del para_estimator['Featsel_Variance'] @@ -389,37 +411,15 @@ def fit_and_score(X, y, scoring, del VarSel # Check whether there are any features left - if len(feature_values[0]) == 0: - # TODO: Make a specific WORC exception for this warning. - if verbose: - print('[WARNING]: No features are selected! Probably you selected a feature group that is not in your feature file. Parameters:') - print(para) - para_estimator = delete_nonestimator_parameters(para_estimator) - - # Return a zero performance dummy - ret = [train_score, test_score, test_sample_counts, - fit_time, score_time, para_estimator, para] - - if return_all: - return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Smote, RandOverSample - else: - return ret - - # Check whether there are any features left - if len(feature_values[0]) == 0: + if len(X_train[0]) == 0: # TODO: Make a specific WORC exception for this warning. if verbose: - print('[WORC WARNING]: No features are selected! 
Probably you selected a feature group that is not in your feature file. Parameters:') - print(para) - + print('[WARNING]: No features are selected! Probably your features have too little variance. Parameters:') + print(parameters) para_estimator = delete_nonestimator_parameters(para_estimator) - # Return a zero performance dummy - ret = [train_score, test_score, test_sample_counts, - fit_time, score_time, para_estimator, para] - if return_all: - return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Smote, RandOverSample + return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Sampler else: return ret @@ -438,31 +438,46 @@ def fit_and_score(X, y, scoring, distance_p = para_estimator['ReliefDistanceP'] numf = para_estimator['ReliefNumFeatures'] + # Fit RELIEF object ReliefSel = SelectMulticlassRelief(n_neighbours=n_neighbours, sample_size=sample_size, distance_p=distance_p, numf=numf, random_state=random_seed) - ReliefSel.fit(feature_values, y) + ReliefSel.fit(X_train, y) if verbose: - print("Original Length: " + str(len(feature_values[0]))) - feature_values = ReliefSel.transform(feature_values) + print("\t Original Length: " + str(len(X_train[0]))) + + # Transform all objects accordingly + X_train = ReliefSel.transform(X_train) if verbose: - print("New Length: " + str(len(feature_values[0]))) + print("\t New Length: " + str(len(X_train[0]))) feature_labels = ReliefSel.transform(feature_labels) - feature_labels.sort() - - # Delete the object if we do not need to return it - if not return_all: - del ReliefSel + X_test = ReliefSel.transform(X_test) - if 'ReliefUse' in para_estimator.keys(): del para_estimator['ReliefUse'] del para_estimator['ReliefNN'] del para_estimator['ReliefSampleSize'] del para_estimator['ReliefDistanceP'] del para_estimator['ReliefNumFeatures'] + # Delete the object if we do not need to return it + if not return_all: + del ReliefSel + + # Check whether there are any features left + if len(X_train[0]) == 0: + # TODO: Make a specific WORC exception for this warning. + if verbose: + print('[WARNING]: No features are selected! Probably RELIEF could not properly select features. 
Parameters:') + print(parameters) + para_estimator = delete_nonestimator_parameters(para_estimator) + + if return_all: + return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Sampler + else: + return ret + # ------------------------------------------------------------------------ # Perform feature selection using a model if 'SelectFromModel' in para_estimator.keys() and para_estimator['SelectFromModel'] == 'True': @@ -476,16 +491,17 @@ def fit_and_score(X, y, scoring, # Create and fit lasso model lassomodel = Lasso(alpha=alpha) - lassomodel.fit(feature_values, y) + lassomodel.fit(X_train, y) # Use fit to select optimal features SelectModel = SelectFromModel(lassomodel, prefit=True) if verbose: - print("Original Length: " + str(len(feature_values[0]))) - feature_values = SelectModel.transform(feature_values) + print("\t Original Length: " + str(len(X_train[0]))) + X_train = SelectModel.transform(X_train) if verbose: - print("New Length: " + str(len(feature_values[0]))) + print("\t New Length: " + str(len(X_train[0]))) feature_labels = SelectModel.transform(feature_labels) + X_test = SelectModel.transform(X_test) if 'SelectFromModel' in para_estimator.keys(): del para_estimator['SelectFromModel'] @@ -494,25 +510,37 @@ def fit_and_score(X, y, scoring, if not return_all: del SelectModel + # Check whether there are any features left + if len(X_train[0]) == 0: + # TODO: Make a specific WORC exception for this warning. + if verbose: + print('[WARNING]: No features are selected! Probably SelectFromModel could not properly select features. Parameters:') + print(parameters) + para_estimator = delete_nonestimator_parameters(para_estimator) + + if return_all: + return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Sampler + else: + return ret + # ---------------------------------------------------------------- # PCA dimensionality reduction # Principle Component Analysis if 'UsePCA' in para_estimator.keys() and para_estimator['UsePCA'] == 'True': if verbose: print('Fitting PCA') - print("Original Length: " + str(len(feature_values[0]))) + print("\t Original Length: " + str(len(X_train[0]))) if para_estimator['PCAType'] == '95variance': # Select first X components that describe 95 percent of the explained variance pca = PCA(n_components=None, random_state=random_seed) try: - pca.fit(feature_values) + pca.fit(X_train) except (ValueError, LinAlgError) as e: - print(f'[WARNING]: skipping this setting due to PCA Error: {e}.') - ret = [train_score, test_score, test_sample_counts, - fit_time, score_time, para_estimator, para] + if verbose: + print(f'[WARNING]: skipping this setting due to PCA Error: {e}.') if return_all: - return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Smote, RandOverSample + return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Sampler else: return ret @@ -526,33 +554,35 @@ def fit_and_score(X, y, scoring, # Make a PCA based on the determined amound of components pca = PCA(n_components=num, random_state=random_seed) try: - pca.fit(feature_values) + pca.fit(X_train) except (ValueError, LinAlgError) as e: - print(f'[WARNING]: skipping this setting due to PCA Error: {e}.') - ret = [train_score, test_score, test_sample_counts, - fit_time, score_time, para_estimator, para] + if verbose: + print(f'[WARNING]: skipping this setting due to PCA Error: {e}.') if 
return_all: - return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Smote, RandOverSample + return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Sampler else: return ret - feature_values = pca.transform(feature_values) + X_train = pca.transform(X_train) + X_test = pca.transform(X_test) else: # Assume a fixed number of components: cannot be larger than # n_samples - n_components = min(len(feature_values), int(para_estimator['PCAType'])) + n_components = min(len(X_train), int(para_estimator['PCAType'])) - if n_components >= len(feature_values[0]): - print(f"[WORC WARNING] PCA n_components ({n_components})> n_features ({len(feature_values[0])}): skipping PCA.") + if n_components >= len(X_train[0]): + if verbose: + print(f"[WORC WARNING] PCA n_components ({n_components})> n_features ({len(X_train[0])}): skipping PCA.") else: pca = PCA(n_components=n_components, random_state=random_seed) - pca.fit(feature_values) - feature_values = pca.transform(feature_values) + pca.fit(X_train) + X_train = pca.transform(X_train) + X_test = pca.transform(X_test) if verbose: - print("New Length: " + str(len(feature_values[0]))) + print("\t New Length: " + str(len(X_train[0]))) # Delete the object if we do not need to return it if not return_all: @@ -569,18 +599,26 @@ def fit_and_score(X, y, scoring, metric = para_estimator['StatisticalTestMetric'] threshold = para_estimator['StatisticalTestThreshold'] if verbose: - print(f"Selecting features based on statistical test. Method {metric}, threshold {round(threshold, 2)}.") - if verbose: - print("Original Length: " + str(len(feature_values[0]))) + print(f"Selecting features based on statistical test. Method {metric}, threshold {round(threshold, 5)}.") + print("\t Original Length: " + str(len(X_train[0]))) StatisticalSel = StatisticalTestThreshold(metric=metric, threshold=threshold) - StatisticalSel.fit(feature_values, y) - feature_values = StatisticalSel.transform(feature_values) - feature_labels = StatisticalSel.transform(feature_labels) + StatisticalSel.fit(X_train, y) + X_train_temp = StatisticalSel.transform(X_train) + if len(X_train_temp[0]) == 0: + if verbose: + print('[WORC WARNING]: No features are selected! Probably your statistical test feature selection was too strict. Skipping thresholding.') + StatisticalSel = None + parameters['StatisticalTestUse'] = 'False' + else: + X_train = StatisticalSel.transform(X_train) + feature_labels = StatisticalSel.transform(feature_labels) + X_test = StatisticalSel.transform(X_test) + if verbose: - print("New Length: " + str(len(feature_values[0]))) + print("\t New Length: " + str(len(X_train[0]))) del para_estimator['StatisticalTestUse'] del para_estimator['StatisticalTestMetric'] del para_estimator['StatisticalTestThreshold'] @@ -589,105 +627,73 @@ def fit_and_score(X, y, scoring, if not return_all: del StatisticalSel - # -------------------------------------------------------------------- - # Final check if there are still features left - # Check whether there are any features left - if len(feature_values[0]) == 0: - # TODO: Make a specific WORC exception for this warning. - if verbose: - print('[WORC WARNING]: No features are selected! Probably you selected a feature group that is not in your feature file. 
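A condensed sketch of the fixed-components PCA branch above: the requested component count is clipped to the number of samples, and PCA is skipped when it would not reduce the feature count (data invented for illustration):

import numpy as np
from sklearn.decomposition import PCA

X_train = np.random.default_rng(0).normal(size=(10, 6))
requested = 25

n_components = min(len(X_train), requested)  # never more than n_samples
if n_components >= X_train.shape[1]:
    print('skipping PCA')                    # nothing to reduce
else:
    X_train = PCA(n_components=n_components).fit_transform(X_train)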
Parameters:') - print(para) - - para_estimator = delete_nonestimator_parameters(para_estimator) - - # Return a zero performance dummy - ret = [train_score, test_score, test_sample_counts, - fit_time, score_time, para_estimator, para] - - if return_all: - return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Smote, RandOverSample - else: - return ret - # ------------------------------------------------------------------------ - # Use SMOTE oversampling - if 'SampleProcessing_SMOTE' in para_estimator.keys(): - if para_estimator['SampleProcessing_SMOTE'] == 'True': + # Use object resampling + if 'Resampling_Use' in para_estimator.keys(): + if para_estimator['Resampling_Use'] == 'True': # Determine our starting balance - pos_initial = int(np.sum(y)) - neg_initial = int(len(y) - pos_initial) - len_in = len(y) + pos_initial = int(np.sum(y_train)) + neg_initial = int(len(y_train) - pos_initial) + len_in = len(y_train) - # Fit SMOTE object and transform dataset + # Fit ObjectSampler and transform dataset # NOTE: need to save random state for this one as well! - Smote = SMOTE(random_state=random_state, - ratio=para_estimator['SampleProcessing_SMOTE_ratio'], - m_neighbors=para_estimator['SampleProcessing_SMOTE_neighbors'], - kind='borderline1', - n_jobs=para_estimator['SampleProcessing_SMOTE_n_cores']) - - feature_values, y = Smote.fit_sample(feature_values, y) - - # Also make sure our feature label object has the same size - # NOTE: Not sure if this is the best implementation - feature_labels = np.asarray([feature_labels[0] for x in X]) - - # Note the user what SMOTE did - pos = int(np.sum(y)) - neg = int(len(y) - pos) - if verbose: - message = f"Sampling with SMOTE from {len_in} ({pos_initial} pos," +\ - f" {neg_initial} neg) to {len(y)} ({pos} pos, {neg} neg) patients." - print(message) + Sampler =\ + ObjectSampler(method=para_estimator['Resampling_Method'], + sampling_strategy=para_estimator['Resampling_sampling_strategy'], + n_jobs=para_estimator['Resampling_n_cores'], + n_neighbors=para_estimator['Resampling_n_neighbors'], + k_neighbors=para_estimator['Resampling_k_neighbors'], + threshold_cleaning=para_estimator['Resampling_threshold_cleaning'], + verbose=verbose) - del para_estimator['SampleProcessing_SMOTE'] - del para_estimator['SampleProcessing_SMOTE_ratio'] - del para_estimator['SampleProcessing_SMOTE_neighbors'] - del para_estimator['SampleProcessing_SMOTE_n_cores'] - - # Delete the object if we do not need to return it - if not return_all: - del Smote - - # ------------------------------------------------------------------------ - # Full Oversampling: To Do - if 'SampleProcessing_Oversampling' in para_estimator.keys(): - if para_estimator['SampleProcessing_Oversampling'] == 'True': - if verbose: - print('Oversample underrepresented classes in training.') + try: + Sampler.fit(X_train, y_train) + X_train_temp, y_train_temp = Sampler.transform(X_train, y_train) - # Oversample underrepresented classes in training - # We always use a factor 1, e.g. 
all classes end up with an - # equal number of samples - if len(y.shape) == 1: - # Single Class, use imblearn oversampling - RandOverSample = RandomOverSampler(random_state=random_state) - feature_values, y = RandOverSample.fit_sample(feature_values, y) + except ae.WORCValueError as e: + message = str(e) + if verbose: + print('[WORC WARNING] Skipping resampling: ' + message) + Sampler = None + parameters['Resampling_Use'] = 'False' else: - # Multi class, use own method as imblearn cannot do this - sumclass = [np.sum(y[:, i]) for i in range(y.shape[1])] - maxclass = np.argmax(sumclass) - for i in range(y.shape[1]): - if i != maxclass: - # Oversample - nz = np.nonzero(y[:, i])[0] - noversample = sumclass[maxclass] - sumclass[i] - while noversample > 0: - n_sample = random.randint(0, len(nz) - 1) - n_sample = nz[n_sample] - i_sample = y[n_sample, :] - x_sample = feature_values[n_sample] - y = np.vstack((y, i_sample)) - feature_values.append(x_sample) - noversample -= 1 - - del para_estimator['SampleProcessing_Oversampling'] + pos = int(np.sum(y_train_temp)) + neg = int(len(y_train_temp) - pos) + if pos < 10 or neg < 10: + if verbose: + print(f'[WORC WARNING] Skipping resampling: to few objects returned in one or both classes (pos: {pos}, neg: {neg}).') + Sampler = None + parameters['Resampling_Use'] = 'False' + else: + X_train = X_train_temp + y_train = y_train_temp + + # Notify the user what the resampling did + pos = int(np.sum(y_train)) + neg = int(len(y_train) - pos) + if verbose: + message = f"Resampling from {len_in} ({pos_initial} pos," +\ + f" {neg_initial} neg) to {len(y_train)} ({pos} pos, {neg} neg) patients." + print(message) + + # Also reset train and test indices + train = range(0, len(y_train)) + test = range(len(y_train), len(y_train) + len(y_test)) + + del para_estimator['Resampling_Use'] + del para_estimator['Resampling_Method'] + del para_estimator['Resampling_sampling_strategy'] + del para_estimator['Resampling_n_neighbors'] + del para_estimator['Resampling_k_neighbors'] + del para_estimator['Resampling_threshold_cleaning'] + del para_estimator['Resampling_n_cores'] # Delete the object if we do not need to return it if not return_all: - del RandOverSample + del Sampler # ---------------------------------------------------------------- # Fitting and scoring @@ -713,45 +719,43 @@ def fit_and_score(X, y, scoring, # Multiclass, hence employ a multiclass classifier for e.g. 
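The guards around the ObjectSampler above reduce to: fit on the training split only, and fall back to no resampling when the sampler raises or leaves fewer than 10 objects in either class. A sketch of that control flow (exception type and sampler interface as used in the diff):

import numpy as np
import WORC.addexceptions as ae

def try_resampling(sampler, X_train, y_train, verbose=True):
    try:
        sampler.fit(X_train, y_train)
        X_res, y_res = sampler.transform(X_train, y_train)
    except ae.WORCValueError as e:
        if verbose:
            print(f'[WORC WARNING] Skipping resampling: {e}')
        return X_train, y_train, None

    pos = int(np.sum(y_res))
    neg = int(len(y_res) - pos)
    if pos < 10 or neg < 10:
        # Too few objects returned in one or both classes.
        return X_train, y_train, None
    return X_res, y_res, sampler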
SVM, LR estimator.set_params(**para_estimator) estimator = OneVsRestClassifier(estimator) - para_estimator = {} if verbose: print("Fitting ML.") + # Recombine feature values and label for train and test set + feature_values = np.concatenate((X_train, X_test), axis=0) + y = np.concatenate((y_train, y_test), axis=0) + para_estimator = None + try: ret = _fit_and_score(estimator, feature_values, y, - scorer, train, + scorers, train, test, verbose, - para_estimator, fit_params, return_train_score, - return_parameters, - return_n_test_samples, - return_times, error_score) + para_estimator, fit_params, + return_train_score=return_train_score, + return_parameters=return_parameters, + return_n_test_samples=return_n_test_samples, + return_times=return_times, + return_estimator=return_estimator, + error_score=error_score) except (ValueError, LinAlgError) as e: if type(estimator) == LDA: - print(f'[WARNING]: skipping this setting due to LDA Error: {e}.') - ret = [train_score, test_score, test_sample_counts, - fit_time, score_time, para_estimator, para] + if verbose: + print(f'[WARNING]: skipping this setting due to LDA Error: {e}.') if return_all: - return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Smote, RandOverSample + return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Sampler else: return ret else: raise e - # Remove 'estimator object', it's the causes of a bug. - # Somewhere between scikit-learn 0.18.2 and 0.20.2 - # the estimator object return value was added - # removing this element fixes a bug that occurs later - # in SearchCV.py, where an array without estimator - # object is expected. - del ret[-1] - - # Paste original parameters in performance - ret.append(para) + # Add original parameters to return object + ret.append(parameters) if return_all: - return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Smote, RandOverSample + return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, imputer, pca, StatisticalSel, ReliefSel, Sampler else: return ret @@ -768,6 +772,13 @@ def delete_nonestimator_parameters(parameters): del parameters['UsePCA'] del parameters['PCAType'] + if 'ReliefUse' in parameters.keys(): + del parameters['ReliefUse'] + del parameters['ReliefNN'] + del parameters['ReliefSampleSize'] + del parameters['ReliefDistanceP'] + del parameters['ReliefNumFeatures'] + if 'Imputation' in parameters.keys(): del parameters['Imputation'] del parameters['ImputationMethod'] @@ -790,14 +801,14 @@ def delete_nonestimator_parameters(parameters): del parameters['StatisticalTestMetric'] del parameters['StatisticalTestThreshold'] - if 'SampleProcessing_SMOTE' in parameters.keys(): - del parameters['SampleProcessing_SMOTE'] - del parameters['SampleProcessing_SMOTE_ratio'] - del parameters['SampleProcessing_SMOTE_neighbors'] - del parameters['SampleProcessing_SMOTE_n_cores'] - - if 'SampleProcessing_Oversampling' in parameters.keys(): - del parameters['SampleProcessing_Oversampling'] + if 'Resampling_Use' in parameters.keys(): + del parameters['Resampling_Use'] + del parameters['Resampling_Method'] + del parameters['Resampling_sampling_strategy'] + del parameters['Resampling_n_neighbors'] + del parameters['Resampling_k_neighbors'] + del parameters['Resampling_threshold_cleaning'] + del parameters['Resampling_n_cores'] if 'random_seed' in parameters.keys(): del parameters['random_seed'] diff --git 
a/WORC/classification/metrics.py b/WORC/classification/metrics.py index 56054b36..331299f3 100644 --- a/WORC/classification/metrics.py +++ b/WORC/classification/metrics.py @@ -261,6 +261,97 @@ def check_scoring(estimator, scoring=None, allow_none=False): return scorer +def check_multimetric_scoring(estimator, scoring=None): + """Wrapper around sklearn function to enable more scoring options. + + + Check the scoring parameter in cases when multiple metrics are allowed + + Parameters + ---------- + estimator : sklearn estimator instance + The estimator for which the scoring will be applied. + scoring : string, callable, list/tuple, dict or None, default: None + A single string (see :ref:`scoring_parameter`) or a callable + (see :ref:`scoring`) to evaluate the predictions on the test set. + For evaluating multiple metrics, either give a list of (unique) strings + or a dict with names as keys and callables as values. + NOTE that when using custom scorers, each scorer should return a single + value. Metric functions returning a list/array of values can be wrapped + into multiple scorers that return one value each. + See :ref:`multimetric_grid_search` for an example. + If None the estimator's score method is used. + The return value in that case will be ``{'score': }``. + If the estimator's score method is not available, a ``TypeError`` + is raised. + Returns + ------- + scorers_dict : dict + A dict mapping each scorer name to its validated scorer. + is_multimetric : bool + True if scorer is a list/tuple or dict of callables + False if scorer is None/str/callable + """ + if callable(scoring) or scoring is None or isinstance(scoring, + str): + scorers = {"score": check_scoring(estimator, scoring=scoring)} + return scorers, False + else: + err_msg_generic = ("scoring should either be a single string or " + "callable for single metric evaluation or a " + "list/tuple of strings or a dict of scorer name " + "mapped to the callable for multiple metric " + "evaluation. Got %s of type %s" + % (repr(scoring), type(scoring))) + + if isinstance(scoring, (list, tuple, set)): + err_msg = ("The list/tuple elements must be unique " + "strings of predefined scorers. ") + invalid = False + try: + keys = set(scoring) + except TypeError: + invalid = True + if invalid: + raise ValueError(err_msg) + + if len(keys) != len(scoring): + raise ValueError(err_msg + "Duplicate elements were found in" + " the given list. %r" % repr(scoring)) + elif len(keys) > 0: + if not all(isinstance(k, str) for k in keys): + if any(callable(k) for k in keys): + raise ValueError(err_msg + + "One or more of the elements were " + "callables. Use a dict of score name " + "mapped to the scorer callable. " + "Got %r" % repr(scoring)) + else: + raise ValueError(err_msg + + "Non-string types were found in " + "the given list. Got %r" + % repr(scoring)) + scorers = {scorer: check_scoring(estimator, scoring=scorer) + for scorer in scoring} + else: + raise ValueError(err_msg + + "Empty list was given. %r" % repr(scoring)) + + elif isinstance(scoring, dict): + keys = set(scoring) + if not all(isinstance(k, str) for k in keys): + raise ValueError("Non-string types were found in the keys of " + "the given dict. scoring=%r" % repr(scoring)) + if len(keys) == 0: + raise ValueError("An empty dict was passed. 
%r" + % repr(scoring)) + scorers = {key: check_scoring(estimator, scoring=scorer) + for key, scorer in scoring.items()} + else: + raise ValueError(err_msg_generic) + return scorers, True + + def ICC(M, ICCtype='inter'): ''' Input: diff --git a/WORC/classification/parameter_optimization.py b/WORC/classification/parameter_optimization.py index 366329db..691469aa 100644 --- a/WORC/classification/parameter_optimization.py +++ b/WORC/classification/parameter_optimization.py @@ -89,7 +89,6 @@ def random_search_parameters(features, labels, N_iter, test_size, print("Best found parameters:") for i in random_search.best_params_: print(f'{i}: {random_search.best_params_[i]}.') - print("\n Best score using best parameters:") - print(random_search.best_score_) + print(f"\n Best score using best parameters: {scoring_method} = {random_search.best_score_}") return random_search diff --git a/WORC/classification/trainclassifier.py b/WORC/classification/trainclassifier.py index 8bb86ed5..acc19c84 100644 --- a/WORC/classification/trainclassifier.py +++ b/WORC/classification/trainclassifier.py @@ -17,13 +17,12 @@ import json import os - +from scipy.stats import uniform from WORC.classification import crossval as cv from WORC.classification import construct_classifier as cc -from WORC.plotting.plot_SVM import plot_SVM +from WORC.plotting.plot_estimator_performance import plot_estimator_performance from WORC.IOparser.file_io import load_features import WORC.IOparser.config_io_classifier as config_io -from scipy.stats import uniform from WORC.classification.AdvancedSampler import discrete_uniform, \ log_uniform, boolean_uniform @@ -32,8 +31,9 @@ def trainclassifier(feat_train, patientinfo_train, config, output_hdf, output_json, feat_test=None, patientinfo_test=None, fixedsplits=None, verbose=True): - ''' - Train a classifier using machine learning from features. By default, if no + """Train a classifier using machine learning from features. + + By default, if no split in training and test is supplied, a cross validation will be performed. @@ -85,8 +85,7 @@ def trainclassifier(feat_train, patientinfo_train, config, verbose: boolean, default True print final feature values and labels to command line or not. 
- ''' - + """ # Convert inputs from lists to strings if type(patientinfo_train) is list: patientinfo_train = ''.join(patientinfo_train) @@ -162,15 +161,22 @@ def trainclassifier(feat_train, patientinfo_train, config, param_grid['FeatureScaling'] = config['FeatureScaling']['scaling_method'] # Add parameters for oversampling methods - param_grid['SampleProcessing_SMOTE'] = config['SampleProcessing']['SMOTE'] - param_grid['SampleProcessing_SMOTE_ratio'] =\ - uniform(loc=config['SampleProcessing']['SMOTE_ratio'][0], - scale=config['SampleProcessing']['SMOTE_ratio'][1]) - param_grid['SampleProcessing_SMOTE_neighbors'] =\ - discrete_uniform(loc=config['SampleProcessing']['SMOTE_neighbors'][0], - scale=config['SampleProcessing']['SMOTE_neighbors'][1]) - param_grid['SampleProcessing_SMOTE_n_cores'] = [config['General']['Joblib_ncores']] - param_grid['SampleProcessing_Oversampling'] = config['SampleProcessing']['Oversampling'] + param_grid['Resampling_Use'] =\ + boolean_uniform(threshold=config['Resampling']['Use']) + param_grid['Resampling_Method'] = config['Resampling']['Method'] + param_grid['Resampling_sampling_strategy'] =\ + config['Resampling']['sampling_strategy'] + param_grid['Resampling_n_neighbors'] =\ + discrete_uniform(loc=config['Resampling']['n_neighbors'][0], + scale=config['Resampling']['n_neighbors'][1]) + param_grid['Resampling_k_neighbors'] =\ + discrete_uniform(loc=config['Resampling']['k_neighbors'][0], + scale=config['Resampling']['k_neighbors'][1]) + param_grid['Resampling_threshold_cleaning'] =\ + uniform(loc=config['Resampling']['threshold_cleaning'][0], + scale=config['Resampling']['threshold_cleaning'][1]) + + param_grid['Resampling_n_cores'] = [config['General']['Joblib_ncores']] # Extract hyperparameter grid settings for SearchCV from config param_grid['FeatPreProcess'] = config['FeatPreProcess']['Use'] @@ -249,7 +255,7 @@ def trainclassifier(feat_train, patientinfo_train, config, if not os.path.exists(os.path.dirname(output_hdf)): os.makedirs(os.path.dirname(output_hdf)) - trained_classifier.to_hdf(output_hdf, 'SVMdata') + trained_classifier.to_hdf(output_hdf, 'EstimatorData') # Check whether we do regression or classification regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet'] @@ -260,48 +266,53 @@ def trainclassifier(feat_train, patientinfo_train, config, overfit_scaler = config['Evaluation']['OverfitScaler'] if feat_test is None: if not isclassifier: - statistics = plot_SVM(trained_classifier, label_data_train, - label_type, ensemble=config['Ensemble']['Use'], - bootstrap=config['Bootstrap']['Use'], - bootstrap_N=config['Bootstrap']['N_iterations'], - overfit_scaler=overfit_scaler) + statistics =\ + plot_estimator_performance(trained_classifier, + label_data_train, + label_type, + ensemble=config['Ensemble']['Use'], + bootstrap=config['Bootstrap']['Use'], + bootstrap_N=config['Bootstrap']['N_iterations'], + overfit_scaler=overfit_scaler) else: - statistics = plot_SVM(trained_classifier, label_data_train, - label_type, modus=modus, - ensemble=config['Ensemble']['Use'], - bootstrap=config['Bootstrap']['Use'], - bootstrap_N=config['Bootstrap']['N_iterations'], - overfit_scaler=overfit_scaler) + statistics =\ + plot_estimator_performance(trained_classifier, + label_data_train, + label_type, modus=modus, + ensemble=config['Ensemble']['Use'], + bootstrap=config['Bootstrap']['Use'], + bootstrap_N=config['Bootstrap']['N_iterations'], + overfit_scaler=overfit_scaler) else: if patientinfo_test is not None: if not isclassifier: - statistics = 
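A note on the distributions in the param_grid above: scipy's uniform(loc, scale) samples from [loc, loc + scale], so each two-element config entry reads as [minimum, range], not [min, max]. For example (config values invented):

from scipy.stats import uniform

# config['Resampling']['threshold_cleaning'] = [0.25, 0.5] would sample
# the cleaning threshold uniformly from [0.25, 0.75].
dist = uniform(loc=0.25, scale=0.5)
samples = dist.rvs(size=1000, random_state=42)
print(samples.min() >= 0.25, samples.max() <= 0.75)  # True True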
plot_SVM(trained_classifier, - label_data_test, - label_type, - ensemble=config['Ensemble']['Use'], - bootstrap=config['Bootstrap']['Use'], - bootstrap_N=config['Bootstrap']['N_iterations'], - overfit_scaler=overfit_scaler) + statistics =\ + plot_estimator_performance(trained_classifier, + label_data_test, + label_type, + ensemble=config['Ensemble']['Use'], + bootstrap=config['Bootstrap']['Use'], + bootstrap_N=config['Bootstrap']['N_iterations'], + overfit_scaler=overfit_scaler) else: - statistics = plot_SVM(trained_classifier, - label_data_test, - label_type, - modus=modus, - ensemble=config['Ensemble']['Use'], - bootstrap=config['Bootstrap']['Use'], - bootstrap_N=config['Bootstrap']['N_iterations'], - overfit_scaler=overfit_scaler) + statistics =\ + plot_estimator_performance(trained_classifier, + label_data_test, + label_type, + modus=modus, + ensemble=config['Ensemble']['Use'], + bootstrap=config['Bootstrap']['Use'], + bootstrap_N=config['Bootstrap']['N_iterations'], + overfit_scaler=overfit_scaler) else: statistics = None # Save output - savedict = dict() - savedict["Statistics"] = statistics if not os.path.exists(os.path.dirname(output_json)): os.makedirs(os.path.dirname(output_json)) with open(output_json, 'w') as fp: - json.dump(savedict, fp, sort_keys=True, indent=4) + json.dump(statistics, fp, sort_keys=True, indent=4) print("Saved data!") diff --git a/WORC/doc/_build/doctrees/autogen/WORC.IOparser.doctree b/WORC/doc/_build/doctrees/autogen/WORC.IOparser.doctree index 234f595e..37001630 100644 Binary files a/WORC/doc/_build/doctrees/autogen/WORC.IOparser.doctree and b/WORC/doc/_build/doctrees/autogen/WORC.IOparser.doctree differ diff --git a/WORC/doc/_build/doctrees/autogen/WORC.classification.doctree b/WORC/doc/_build/doctrees/autogen/WORC.classification.doctree index 872cce98..1aa9b1a1 100644 Binary files a/WORC/doc/_build/doctrees/autogen/WORC.classification.doctree and b/WORC/doc/_build/doctrees/autogen/WORC.classification.doctree differ diff --git a/WORC/doc/_build/doctrees/autogen/WORC.config.doctree b/WORC/doc/_build/doctrees/autogen/WORC.config.doctree index 3f1ea0b9..84780e3b 100644 Binary files a/WORC/doc/_build/doctrees/autogen/WORC.config.doctree and b/WORC/doc/_build/doctrees/autogen/WORC.config.doctree differ diff --git a/WORC/doc/_build/doctrees/autogen/WORC.detectors.doctree b/WORC/doc/_build/doctrees/autogen/WORC.detectors.doctree index b9d0aed1..df1f0090 100644 Binary files a/WORC/doc/_build/doctrees/autogen/WORC.detectors.doctree and b/WORC/doc/_build/doctrees/autogen/WORC.detectors.doctree differ diff --git a/WORC/doc/_build/doctrees/autogen/WORC.doctree b/WORC/doc/_build/doctrees/autogen/WORC.doctree index da2bd94b..7e84230c 100644 Binary files a/WORC/doc/_build/doctrees/autogen/WORC.doctree and b/WORC/doc/_build/doctrees/autogen/WORC.doctree differ diff --git a/WORC/doc/_build/doctrees/autogen/WORC.exampledata.doctree b/WORC/doc/_build/doctrees/autogen/WORC.exampledata.doctree index cd080ad5..436aafc7 100644 Binary files a/WORC/doc/_build/doctrees/autogen/WORC.exampledata.doctree and b/WORC/doc/_build/doctrees/autogen/WORC.exampledata.doctree differ diff --git a/WORC/doc/_build/doctrees/autogen/WORC.facade.doctree b/WORC/doc/_build/doctrees/autogen/WORC.facade.doctree index 5f326d21..83aa6d4a 100644 Binary files a/WORC/doc/_build/doctrees/autogen/WORC.facade.doctree and b/WORC/doc/_build/doctrees/autogen/WORC.facade.doctree differ diff --git a/WORC/doc/_build/doctrees/autogen/WORC.featureprocessing.doctree 
diff --git a/WORC/doc/_build/html/_modules/WORC/WORC.html b/WORC/doc/_build/html/_modules/WORC/WORC.html
@@ -176,21 +183,21 @@

              Source code for WORC.WORC

               # See the License for the specific language governing permissions and
               # limitations under the License.
               
              -import configparser
              -import fastr
              -from fastr.api import ResourceLimit
               import os
              -from random import randint
              +import yaml
              +import fastr
               import graphviz
              +import configparser
              +from pathlib import Path
              +from random import randint
              +import WORC.IOparser.file_io as io
              +from fastr.api import ResourceLimit
              +from WORC.tools.Slicer import Slicer
              +from WORC.tools.Elastix import Elastix
              +from WORC.tools.Evaluate import Evaluate
               import WORC.addexceptions as WORCexceptions
               import WORC.IOparser.config_WORC as config_io
              -from WORC.tools.Elastix import Elastix
              -from WORC.tools.Evaluate import Evaluate
              -from WORC.tools.Slicer import Slicer
              -from WORC.detectors.detectors import DebugDetector
              -from pathlib import Path
              -import yaml
              -import WORC.IOparser.file_io as io
              +from WORC.detectors.detectors import DebugDetector
               
               
               
[docs]class WORC(object):
@@ -265,10 +272,8 @@

              Source code for WORC.WORC

                       CopyMetadata: Boolean, default True
                           when using elastix, copy metadata from image to segmentation or not
               
-
     """
-
-
[docs]    def __init__(self, name='test'):
        """Initialize WORC object.

        Set the initial variables all to None, except for some defaults.

@@ -346,14 +351,21 @@

              Source code for WORC.WORC

                       config['General']['Segmentix'] = 'True'
                       config['General']['FeatureCalculators'] = '[predict/CalcFeatures:1.0, pyradiomics/Pyradiomics:1.0]'
                       config['General']['Preprocessing'] = 'worc/PreProcess:1.0'
              -        config['General']['RegistrationNode'] = "'elastix4.8/Elastix:4.8'"
              -        config['General']['TransformationNode'] = "'elastix4.8/Transformix:4.8'"
              +        config['General']['RegistrationNode'] = "elastix4.8/Elastix:4.8"
              +        config['General']['TransformationNode'] = "elastix4.8/Transformix:4.8"
                       config['General']['Joblib_ncores'] = '1'
                       config['General']['Joblib_backend'] = 'threading'
                       config['General']['tempsave'] = 'False'
                       config['General']['AssumeSameImageAndMaskMetadata'] = 'False'
                       config['General']['ComBat'] = 'False'
               
              +        # Options for the object/patient labels that are used
              +        config['Labels'] = dict()
              +        config['Labels']['label_names'] = 'Label1, Label2'
              +        config['Labels']['modus'] = 'singlelabel'
              +        config['Labels']['url'] = 'WIP'
              +        config['Labels']['projectID'] = 'WIP'
              +
                       # Preprocessing
                       config['Preprocessing'] = dict()
                       config['Preprocessing']['Normalize'] = 'True'
              @@ -459,12 +471,24 @@ 

              Source code for WORC.WORC

                       config['ComBat'] = dict()
                       config['ComBat']['language'] = 'python'
                       config['ComBat']['batch'] = 'Hospital'
              +        config['ComBat']['mod'] = '[]'
                       config['ComBat']['par'] = '1'
                       config['ComBat']['eb'] = '1'
                       config['ComBat']['per_feature'] = '0'
                       config['ComBat']['excluded_features'] = 'sf_, of_, semf_, pf_'
                       config['ComBat']['matlab'] = 'C:\\Program Files\\MATLAB\\R2015b\\bin\\matlab.exe'
               
              +        # Feature imputation
              +        config['Imputation'] = dict()
              +        config['Imputation']['use'] = 'True'
              +        config['Imputation']['strategy'] = 'mean, median, most_frequent, constant, knn'
              +        config['Imputation']['n_neighbors'] = '5, 5'
              +
              +        # Feature scaling options
              +        config['FeatureScaling'] = dict()
              +        config['FeatureScaling']['scale_features'] = 'True'
              +        config['FeatureScaling']['scaling_method'] = 'z_score, robust, minmax'
              +
                       # Feature preprocessing before all below takes place
                       config['FeatPreProcess'] = dict()
                       config['FeatPreProcess']['Use'] = 'False'
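One plausible mapping of the Imputation and FeatureScaling blocks above onto scikit-learn objects; the wiring inside WORC's fitting code is not part of this hunk, so the helper below is illustrative only:

    import numpy as np
    from sklearn.impute import SimpleImputer, KNNImputer
    from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

    # 'z_score', 'robust' and 'minmax' from scaling_method map naturally
    # onto these scikit-learn scalers.
    SCALERS = {'z_score': StandardScaler, 'robust': RobustScaler,
               'minmax': MinMaxScaler}

    def make_imputer(strategy, n_neighbors=5):
        # 'mean', 'median', 'most_frequent' and 'constant' are SimpleImputer
        # strategies; 'knn' needs KNNImputer with the configured n_neighbors.
        if strategy == 'knn':
            return KNNImputer(n_neighbors=n_neighbors)
        return SimpleImputer(strategy=strategy)

    X = np.array([[1.0, np.nan], [3.0, 4.0], [np.nan, 6.0]])
    X = make_imputer('median').fit_transform(X)
    X = SCALERS['z_score']().fit_transform(X)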
              @@ -517,11 +541,17 @@ 

              Source code for WORC.WORC

                       config['SelectFeatGroup']['wavelet_features'] = 'True, False'
                       config['SelectFeatGroup']['log_features'] = 'True, False'
               
              -        # Feature imputation
              -        config['Imputation'] = dict()
              -        config['Imputation']['use'] = 'True'
              -        config['Imputation']['strategy'] = 'mean, median, most_frequent, constant, knn'
              -        config['Imputation']['n_neighbors'] = '5, 5'
              +        # Resampling options
              +        config['Resampling'] = dict()
              +        config['Resampling']['Use'] = '0.20'
              +        config['Resampling']['Method'] =\
              +            'RandomUnderSampling, RandomOverSampling, NearMiss, ' +\
              +            'NeighbourhoodCleaningRule, ADASYN, BorderlineSMOTE, SMOTE, ' +\
              +            'SMOTEENN, SMOTETomek'
              +        config['Resampling']['sampling_strategy'] = 'auto, majority, not minority, not majority, all'
              +        config['Resampling']['n_neighbors'] = '3, 12'
              +        config['Resampling']['k_neighbors'] = '5, 15'
              +        config['Resampling']['threshold_cleaning'] = '0.25, 0.5'
               
                       # Classification
                       config['Classification'] = dict()
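The Method names in the Resampling block above correspond one-to-one to imbalanced-learn samplers; a sketch of that mapping (the ObjectSampler wrapper that WORC actually uses is not shown in this hunk):

    from imblearn.under_sampling import (RandomUnderSampler, NearMiss,
                                         NeighbourhoodCleaningRule)
    from imblearn.over_sampling import (RandomOverSampler, ADASYN,
                                        BorderlineSMOTE, SMOTE)
    from imblearn.combine import SMOTEENN, SMOTETomek

    SAMPLERS = {
        'RandomUnderSampling': RandomUnderSampler,
        'RandomOverSampling': RandomOverSampler,
        'NearMiss': NearMiss,
        'NeighbourhoodCleaningRule': NeighbourhoodCleaningRule,
        'ADASYN': ADASYN,
        'BorderlineSMOTE': BorderlineSMOTE,
        'SMOTE': SMOTE,
        'SMOTEENN': SMOTEENN,
        'SMOTETomek': SMOTETomek,
    }

    # sampling_strategy and k_neighbors are drawn from the config ranges above.
    sampler = SAMPLERS['SMOTE'](sampling_strategy='auto', k_neighbors=5)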
              @@ -537,7 +567,7 @@ 

              Source code for WORC.WORC

                       config['Classification']['RFn_estimators'] = '10, 90'
                       config['Classification']['RFmin_samples_split'] = '2, 3'
                       config['Classification']['RFmax_depth'] = '5, 5'
              -        config['Classification']['LRpenalty'] = 'l2, l1'
              +        config['Classification']['LRpenalty'] = 'l2'
                       config['Classification']['LRC'] = '0.01, 1.0'
                       config['Classification']['LDA_solver'] = 'svd, lsqr, eigen'
                       config['Classification']['LDA_shrinkage'] = '-5, 5'
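The LRpenalty change above follows from scikit-learn's solver constraints: the default 'lbfgs' solver of LogisticRegression supports only the 'l2' (or no) penalty, so keeping 'l1' in the search grid would make those candidates fail. A minimal reproduction:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    X, y = make_classification(random_state=0)

    LogisticRegression(penalty='l2').fit(X, y)   # fine with default 'lbfgs'

    try:
        LogisticRegression(penalty='l1').fit(X, y)
    except ValueError as e:
        print(e)  # lbfgs supports only 'l2' or 'none' penalties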
              @@ -556,36 +586,16 @@ 

              Source code for WORC.WORC

                       config['CrossValidation']['test_size'] = '0.2'
                       config['CrossValidation']['fixed_seed'] = 'False'
               
              -        # Options for the object/patient labels that are used
              -        config['Labels'] = dict()
              -        config['Labels']['label_names'] = 'Label1, Label2'
              -        config['ComBat']['mod'] = config['Labels']['label_names']  # Variation due to label to predict should be maintained
              -        config['Labels']['modus'] = 'singlelabel'
              -        config['Labels']['url'] = 'WIP'
              -        config['Labels']['projectID'] = 'WIP'
              -
                       # Hyperparameter optimization options
                       config['HyperOptimization'] = dict()
                       config['HyperOptimization']['scoring_method'] = 'f1_weighted'
                       config['HyperOptimization']['test_size'] = '0.15'
                       config['HyperOptimization']['n_splits'] = '5'
              -        config['HyperOptimization']['N_iterations'] = '10000'
              -        config['HyperOptimization']['n_jobspercore'] = '2000'  # only relevant when using fastr in classification
              +        config['HyperOptimization']['N_iterations'] = '25000'
              +        config['HyperOptimization']['n_jobspercore'] = '1000'  # only relevant when using fastr in classification
                       config['HyperOptimization']['maxlen'] = '100'
                       config['HyperOptimization']['ranking_score'] = 'test_score'
               
              -        # Feature scaling options
              -        config['FeatureScaling'] = dict()
              -        config['FeatureScaling']['scale_features'] = 'True'
              -        config['FeatureScaling']['scaling_method'] = 'z_score'
              -
              -        # Sample processing options
              -        config['SampleProcessing'] = dict()
              -        config['SampleProcessing']['SMOTE'] = 'True, False'
              -        config['SampleProcessing']['SMOTE_ratio'] = '1, 0'
              -        config['SampleProcessing']['SMOTE_neighbors'] = '5, 15'
              -        config['SampleProcessing']['Oversampling'] = 'False'
              -
                       # Ensemble options
                       config['Ensemble'] = dict()
                       config['Ensemble']['Use'] = '50'
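With the new defaults above, and assuming each fastr job receives n_jobspercore candidate workflows (as the inline comment suggests), the 25000-iteration random search is split into 25 jobs:

    N_iterations = 25000
    n_jobspercore = 1000
    print(N_iterations // n_jobspercore)  # 25 fastr jobs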
              @@ -650,20 +660,24 @@ 

              Source code for WORC.WORC

                                   image_types.append(self.configs[c]['ImageFeatures']['image_type'])
               
                               # Create config source
              -                self.source_class_config = self.network.create_source('ParameterFile', id='config_classification_source', node_group='conf')
              +                self.source_class_config = self.network.create_source('ParameterFile', id='config_classification_source', node_group='conf', step_id='general_sources')
               
                               # Classification tool and label source
              -                self.source_patientclass_train = self.network.create_source('PatientInfoFile', id='patientclass_train', node_group='pctrain')
              +                self.source_patientclass_train = self.network.create_source('PatientInfoFile', id='patientclass_train', node_group='pctrain', step_id='train_sources')
                               if self.labels_test:
              -                    self.source_patientclass_test = self.network.create_source('PatientInfoFile', id='patientclass_test', node_group='pctest')
              +                    self.source_patientclass_test = self.network.create_source('PatientInfoFile', id='patientclass_test', node_group='pctest', step_id='test_sources')
               
                               memory = self.fastr_memory_parameters['Classification']
              -                self.classify = self.network.create_node('worc/TrainClassifier:1.0', tool_version='1.0', id='classify', resources=ResourceLimit(memory=memory))
              +                self.classify = self.network.create_node('worc/TrainClassifier:1.0',
              +                                                         tool_version='1.0',
              +                                                         id='classify',
              +                                                         resources=ResourceLimit(memory=memory),
              +                                                         step_id='WorkflowOptimization')
               
                               # Outputs
              -                self.sink_classification = self.network.create_sink('HDF5', id='classification')
              -                self.sink_performance = self.network.create_sink('JsonFile', id='performance')
              -                self.sink_class_config = self.network.create_sink('ParameterFile', id='config_classification_sink', node_group='conf')
              +                self.sink_classification = self.network.create_sink('HDF5', id='classification', step_id='general_sinks')
              +                self.sink_performance = self.network.create_sink('JsonFile', id='performance', step_id='general_sinks')
              +                self.sink_class_config = self.network.create_sink('ParameterFile', id='config_classification_sink', node_group='conf', step_id='general_sinks')
               
                               # Links
                               self.sink_class_config.input = self.source_class_config.output
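The step_id keyword being threaded through create_source, create_node and create_sink in this file groups nodes into clusters in the graphviz rendering of the network. A minimal sketch, assuming fastr 3.x with the WORC tools on the path; ids and datatypes are illustrative:

    import fastr
    from fastr.api import ResourceLimit

    network = fastr.create_network(id='step_id_example')

    source = network.create_source('ParameterFile', id='config_source',
                                   step_id='general_sources')
    classify = network.create_node('worc/TrainClassifier:1.0',
                                   tool_version='1.0', id='classify',
                                   resources=ResourceLimit(memory='4G'),
                                   step_id='WorkflowOptimization')
    sink = network.create_sink('HDF5', id='classification',
                               step_id='general_sinks')

    network.draw()  # nodes sharing a step_id are drawn as one cluster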
              @@ -742,9 +756,9 @@ 

              Source code for WORC.WORC

                                   else:
                                       nseg = len(self.segmentations_train)
                                       nim = len(image_types)
                        m = f'Length of segmentations for training is ' +\
                            f'{nseg}: should be equal to number of images' +\
                            f' ({nim}) or 1 when using registration.'
                                       raise WORCexceptions.WORCValueError(m)
               
                                   # BUG: We assume that first type defines if we use segmentix
              @@ -792,38 +806,62 @@ 

              Source code for WORC.WORC

                                       self.modlabels.append(label)
               
                                       # Create required sources and sinks
              -                        self.sources_parameters[label] = self.network.create_source('ParameterFile', id='config_' + label)
              -                        self.sources_images_train[label] = self.network.create_source('ITKImageFile', id='images_train_' + label, node_group='train')
              +                        self.sources_parameters[label] = self.network.create_source('ParameterFile', id='config_' + label, step_id='general_sources')
              +                        self.sources_images_train[label] = self.network.create_source('ITKImageFile', id='images_train_' + label, node_group='train', step_id='train_sources')
                                       if self.TrainTest:
              -                            self.sources_images_test[label] = self.network.create_source('ITKImageFile', id='images_test_' + label, node_group='test')
              +                            self.sources_images_test[label] = self.network.create_source('ITKImageFile', id='images_test_' + label, node_group='test', step_id='test_sources')
               
                                       if self.metadata_train and len(self.metadata_train) >= nmod + 1:
              -                            self.sources_metadata_train[label] = self.network.create_source('DicomImageFile', id='metadata_train_' + label, node_group='train')
              +                            self.sources_metadata_train[label] = self.network.create_source('DicomImageFile', id='metadata_train_' + label, node_group='train', step_id='train_sources')
               
                                       if self.metadata_test and len(self.metadata_test) >= nmod + 1:
              -                            self.sources_metadata_test[label] = self.network.create_source('DicomImageFile', id='metadata_test_' + label, node_group='test')
              +                            self.sources_metadata_test[label] = self.network.create_source('DicomImageFile', id='metadata_test_' + label, node_group='test', step_id='test_sources')
               
                                       if self.masks_train and len(self.masks_train) >= nmod + 1:
                                           # Create mask source and convert
              -                            self.sources_masks_train[label] = self.network.create_source('ITKImageFile', id='mask_train_' + label, node_group='train')
              +                            self.sources_masks_train[label] = self.network.create_source('ITKImageFile', id='mask_train_' + label, node_group='train', step_id='train_sources')
                                           memory = self.fastr_memory_parameters['WORCCastConvert']
              -                            self.converters_masks_train[label] = self.network.create_node('worc/WORCCastConvert:0.3.2', tool_version='0.1', id='convert_mask_train_' + label, node_group='train', resources=ResourceLimit(memory=memory))
              +                            self.converters_masks_train[label] =\
              +                                self.network.create_node('worc/WORCCastConvert:0.3.2',
              +                                                         tool_version='0.1',
              +                                                         id='convert_mask_train_' + label,
              +                                                         node_group='train',
              +                                                         resources=ResourceLimit(memory=memory),
              +                                                         step_id='FileConversion')
              +
                                           self.converters_masks_train[label].inputs['image'] = self.sources_masks_train[label].output
               
                                       if self.masks_test and len(self.masks_test) >= nmod + 1:
                                           # Create mask source and convert
              -                            self.sources_masks_test[label] = self.network.create_source('ITKImageFile', id='mask_test_' + label, node_group='test')
              +                            self.sources_masks_test[label] = self.network.create_source('ITKImageFile', id='mask_test_' + label, node_group='test', step_id='test_sources')
                                           memory = self.fastr_memory_parameters['WORCCastConvert']
              -                            self.converters_masks_test[label] = self.network.create_node('worc/WORCCastConvert:0.3.2', tool_version='0.1', id='convert_mask_test_' + label, node_group='test', resources=ResourceLimit(memory=memory))
              +                            self.converters_masks_test[label] =\
              +                                self.network.create_node('worc/WORCCastConvert:0.3.2',
              +                                                         tool_version='0.1',
              +                                                         id='convert_mask_test_' + label,
              +                                                         node_group='test',
              +                                                         resources=ResourceLimit(memory=memory),
              +                                                         step_id='FileConversion')
              +
                                           self.converters_masks_test[label].inputs['image'] = self.sources_masks_test[label].output
               
                                       # First convert the images
                                       if any(modality in mod for modality in ['MR', 'CT', 'MG', 'PET']):
                                           # Use WORC PXCastConvet for converting image formats
                                           memory = self.fastr_memory_parameters['WORCCastConvert']
              -                            self.converters_im_train[label] = self.network.create_node('worc/WORCCastConvert:0.3.2', tool_version='0.1', id='convert_im_train_' + label, resources=ResourceLimit(memory=memory))
              +                            self.converters_im_train[label] =\
              +                                self.network.create_node('worc/WORCCastConvert:0.3.2',
              +                                                         tool_version='0.1',
              +                                                         id='convert_im_train_' + label,
              +                                                         resources=ResourceLimit(memory=memory),
              +                                                         step_id='FileConversion')
                                           if self.TrainTest:
              -                                self.converters_im_test[label] = self.network.create_node('worc/WORCCastConvert:0.3.2', tool_version='0.1', id='convert_im_test_' + label, resources=ResourceLimit(memory=memory))
              +                                self.converters_im_test[label] =\
              +                                    self.network.create_node('worc/WORCCastConvert:0.3.2',
              +                                                             tool_version='0.1',
              +                                                             id='convert_im_test_' + label,
              +                                                             resources=ResourceLimit(memory=memory),
              +                                                             step_id='FileConversion')
               
                                       else:
                                           raise WORCexceptions.WORCTypeError(('No valid image type for modality {}: {} provided.').format(str(nmod), mod))
              @@ -854,7 +892,7 @@ 

              Source code for WORC.WORC

                                           self.featureconverter_test[label] = list()
               
                                       for f in feature_calculators:
                            print(f'\t - Adding feature calculation node: {f}.')
                                           self.add_feature_calculator(f, label, nmod)
               
                                       # -----------------------------------------------------
              @@ -866,13 +904,15 @@ 

              Source code for WORC.WORC

                                           self.sources_segmentations_train[label] =\
                                               self.network.create_source('ITKImageFile',
                                                                          id='segmentations_train_' + label,
              -                                                           node_group='train')
              +                                                           node_group='train',
              +                                                           step_id='train_sources')
               
                                           self.converters_seg_train[label] =\
                                               self.network.create_node('worc/WORCCastConvert:0.3.2',
                                                                        tool_version='0.1',
                                                                        id='convert_seg_train_' + label,
              -                                                         resources=ResourceLimit(memory=memory))
              +                                                         resources=ResourceLimit(memory=memory),
              +                                                         step_id='FileConversion')
               
                                           self.converters_seg_train[label].inputs['image'] =\
                                               self.sources_segmentations_train[label].output
              @@ -881,13 +921,15 @@ 

              Source code for WORC.WORC

                                               self.sources_segmentations_test[label] =\
                                                   self.network.create_source('ITKImageFile',
                                                                              id='segmentations_test_' + label,
              -                                                               node_group='test')
              +                                                               node_group='test',
              +                                                               step_id='test_sources')
               
                                               self.converters_seg_test[label] =\
                                                   self.network.create_node('worc/WORCCastConvert:0.3.2',
                                                                            tool_version='0.1',
                                                                            id='convert_seg_test_' + label,
              -                                                             resources=ResourceLimit(memory=memory))
              +                                                             resources=ResourceLimit(memory=memory),
              +                                                             step_id='FileConversion')
               
                                               self.converters_seg_test[label].inputs['image'] =\
                                                   self.sources_segmentations_test[label].output
              @@ -937,13 +979,13 @@ 

              Source code for WORC.WORC

                                           # Link features to ComBat
                                           self.links_Combat1_train[label] = list()
                                           for i_node, fname in enumerate(self.featurecalculators[label]):
                                self.links_Combat1_train[label].append(self.ComBat.inputs['features_train'][f'{label}_{self.featurecalculators[label][i_node]}'] << self.featureconverter_train[label][i_node].outputs['feat_out'])
                                               self.links_Combat1_train[label][i_node].collapse = 'train'
               
                                           if self.TrainTest:
                                               self.links_Combat1_test[label] = list()
                                               for i_node, fname in enumerate(self.featurecalculators[label]):
                                    self.links_Combat1_test[label].append(self.ComBat.inputs['features_test'][f'{label}_{self.featurecalculators[label][i_node]}'] << self.featureconverter_test[label][i_node].outputs['feat_out'])
                                                   self.links_Combat1_test[label][i_node].collapse = 'test'
               
                                       # -----------------------------------------------------
              @@ -957,11 +999,11 @@ 

              Source code for WORC.WORC

               
                                       for i_node, fname in enumerate(self.featurecalculators[label]):
                                           # Create sink for feature outputs
              -                            self.sinks_features_train[label].append(self.network.create_sink('HDF5', id='features_train_' + label + '_' + fname))
              +                            self.sinks_features_train[label].append(self.network.create_sink('HDF5', id='features_train_' + label + '_' + fname, step_id='train_sinks'))
               
                                           # Append features to the classification
                                           if not self.configs[0]['General']['ComBat'] == 'True':
                                self.links_C1_train[label].append(self.classify.inputs['features_train'][f'{label}_{self.featurecalculators[label][i_node]}'] << self.featureconverter_train[label][i_node].outputs['feat_out'])
                                               self.links_C1_train[label][i_node].collapse = 'train'
               
                                           # Save output
              @@ -970,11 +1012,11 @@ 

              Source code for WORC.WORC

                                           # Similar for testing workflow
                                           if self.TrainTest:
                                               # Create sink for feature outputs
              -                                self.sinks_features_test[label].append(self.network.create_sink('HDF5', id='features_test_' + label + '_' + fname))
              +                                self.sinks_features_test[label].append(self.network.create_sink('HDF5', id='features_test_' + label + '_' + fname, step_id='test_sinks'))
               
                                               # Append features to the classification
                                               if not self.configs[0]['General']['ComBat'] == 'True':
                                    self.links_C1_test[label].append(self.classify.inputs['features_test'][f'{label}_{self.featurecalculators[label][i_node]}'] << self.featureconverter_test[label][i_node].outputs['feat_out'])
                                                   self.links_C1_test[label][i_node].collapse = 'test'
               
                                               # Save output
              @@ -1001,7 +1043,7 @@ 

              Source code for WORC.WORC

                                       self.modlabels.append(label)
               
                                       # Create a node for the feature computation
              -                        self.sources_features_train[label] = self.network.create_source('HDF5', id='features_train_' + label, node_group='train')
              +                        self.sources_features_train[label] = self.network.create_source('HDF5', id='features_train_' + label, node_group='train', step_id='train_sources')
               
                                       # Add the features from this modality to the classifier node input
                                       self.links_C1_train[label] = self.classify.inputs['features_train'][str(label)] << self.sources_features_train[label].output
              @@ -1009,7 +1051,7 @@ 

              Source code for WORC.WORC

               
                                       if self.features_test:
                                           # Create a node for the feature computation
              -                            self.sources_features_test[label] = self.network.create_source('HDF5', id='features_test_' + label, node_group='test')
              +                            self.sources_features_test[label] = self.network.create_source('HDF5', id='features_test_' + label, node_group='test', step_id='test_sources')
               
                                           # Add the features from this modality to the classifier node input
                                           self.links_C1_test[label] = self.classify.inputs['features_test'][str(label)] << self.sources_features_test[label].output
              @@ -1030,10 +1072,11 @@ 

              Source code for WORC.WORC

                           self.network.create_node('combat/ComBat:1.0',
                                                    tool_version='1.0',
                                                    id='ComBat',
              -                                     resources=ResourceLimit(memory=memory))
              +                                     resources=ResourceLimit(memory=memory),
              +                                     step_id='ComBat')
               
                       # Create sink for ComBat output
              -        self.sinks_features_train_ComBat = self.network.create_sink('HDF5', id='features_train_ComBat')
              +        self.sinks_features_train_ComBat = self.network.create_sink('HDF5', id='features_train_ComBat', step_id='ComBat')
               
                       # Create links for inputs
                       self.link_combat_1 = self.network.create_link(self.source_class_config.output, self.ComBat.inputs['config'])
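As a toy illustration of what the ComBat node harmonizes (location and scale differences between batches such as hospitals): the real ComBat additionally uses empirical Bayes shrinkage, and this sketch is not WORC's implementation.

    import numpy as np

    def harmonize(X, batches):
        """Align per-batch mean/std to the pooled statistics (toy version)."""
        Xh = X.astype(float).copy()
        grand_mean, grand_std = X.mean(axis=0), X.std(axis=0)
        for b in np.unique(batches):
            idx = batches == b
            Xh[idx] = (X[idx] - X[idx].mean(axis=0)) / X[idx].std(axis=0)
            Xh[idx] = Xh[idx] * grand_std + grand_mean
        return Xh

    X = np.vstack([np.random.normal(0, 1, (20, 3)),
                   np.random.normal(2, 3, (20, 3))])
    batches = np.array([0] * 20 + [1] * 20)
    print(harmonize(X, batches).mean(axis=0))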
              @@ -1050,7 +1093,7 @@ 

              Source code for WORC.WORC

               
                       if self.TrainTest:
                           # Create sink for ComBat output
              -            self.sinks_features_test_ComBat = self.network.create_sink('HDF5', id='features_test_ComBat')
              +            self.sinks_features_test_ComBat = self.network.create_sink('HDF5', id='features_test_ComBat', step_id='ComBat')
               
                           # Create links for inputs
                           self.link_combat_3 = self.network.create_link(self.source_patientclass_test.output, self.ComBat.inputs['patientclass_test'])
              @@ -1064,9 +1107,9 @@ 

              Source code for WORC.WORC

               
[docs]    def add_preprocessing(self, preprocess_node, label, nmod):
        """Add nodes required for preprocessing of images."""
        memory = self.fastr_memory_parameters['Preprocessing']
-        self.preprocessing_train[label] = self.network.create_node(preprocess_node, tool_version='1.0', id='preprocessing_train_' + label, resources=ResourceLimit(memory=memory))
+        self.preprocessing_train[label] = self.network.create_node(preprocess_node, tool_version='1.0', id='preprocessing_train_' + label, resources=ResourceLimit(memory=memory), step_id='Preprocessing')

         if self.TrainTest:
-            self.preprocessing_test[label] = self.network.create_node(preprocess_node, tool_version='1.0', id='preprocessing_test_' + label, resources=ResourceLimit(memory=memory))
+            self.preprocessing_test[label] = self.network.create_node(preprocess_node, tool_version='1.0', id='preprocessing_test_' + label, resources=ResourceLimit(memory=memory), step_id='Preprocessing')

         # Create required links
         self.preprocessing_train[label].inputs['parameters'] = self.sources_parameters[label].output
@@ -1084,11 +1127,11 @@

              Source code for WORC.WORC

               
                       # If there are masks to use in normalization, add them here
                       if self.masks_normalize_train:
              -            self.sources_masks_normalize_train[label] = self.network.create_source('ITKImageFile', id='masks_normalize_train_' + label, node_group='train')
              +            self.sources_masks_normalize_train[label] = self.network.create_source('ITKImageFile', id='masks_normalize_train_' + label, node_group='train', step_id='Preprocessing')
                           self.preprocessing_train[label].inputs['mask'] = self.sources_masks_normalize_train[label].output
               
                       if self.masks_normalize_test:
              -            self.sources_masks_normalize_test[label] = self.network.create_source('ITKImageFile', id='masks_normalize_test_' + label, node_group='test')
              +            self.sources_masks_normalize_test[label] = self.network.create_source('ITKImageFile', id='masks_normalize_test_' + label, node_group='test', step_id='Preprocessing')
                           self.preprocessing_test[label].inputs['mask'] = self.sources_masks_normalize_test[label].output
[docs]    def add_feature_calculator(self, calcfeat_node, label, nmod):
@@ -1103,14 +1146,16 @@

              Source code for WORC.WORC

                           self.network.create_node(calcfeat_node,
                                                    tool_version='1.0',
                                                    id='calcfeatures_train_' + node_ID,
              -                                     resources=ResourceLimit(memory=memory))
              +                                     resources=ResourceLimit(memory=memory),
              +                                     step_id='Feature_Extraction')
               
                       if self.TrainTest:
                           node_test =\
                               self.network.create_node(calcfeat_node,
                                                        tool_version='1.0',
                                                        id='calcfeatures_test_' + node_ID,
              -                                         resources=ResourceLimit(memory=memory))
              +                                         resources=ResourceLimit(memory=memory),
              +                                         step_id='Feature_Extraction')
               
                       # Check if we need to add pyradiomics specific sources
                       if 'pyradiomics' in calcfeat_node.lower():
              @@ -1118,14 +1163,16 @@ 

              Source code for WORC.WORC

                           self.source_config_pyradiomics[label] =\
                               self.network.create_source('YamlFile',
                                                          id='config_pyradiomics_' + label,
              -                                           node_group='train')
              +                                           node_group='train',
              +                                           step_id='Feature_Extraction')
               
                           # Add a format source, which we are going to set to a constant
                           # And attach to the tool node
                           self.source_format_pyradiomics =\
                               self.network.create_constant('String', 'csv',
                                                            id='format_pyradiomics_' + label,
              -                                             node_group='train')
              +                                             node_group='train',
              +                                             step_id='Feature_Extraction')
                           node_train.inputs['format'] =\
                               self.source_format_pyradiomics.output
               
              @@ -1170,7 +1217,8 @@ 

              Source code for WORC.WORC

                           if self.semantics_train and len(self.semantics_train) >= nmod + 1:
                               self.sources_semantics_train[label] =\
                                   self.network.create_source('CSVFile',
              -                                               id='semantics_train_' + label)
              +                                               id='semantics_train_' + label,
              +                                               step_id='train_sources')
               
                               node_train.inputs['semantics'] =\
                                   self.sources_semantics_train[label].output
              @@ -1178,7 +1226,8 @@ 

              Source code for WORC.WORC

                           if self.semantics_test and len(self.semantics_test) >= nmod + 1:
                               self.sources_semantics_test[label] =\
                                   self.network.create_source('CSVFile',
              -                                               id='semantics_test_' + label)
              +                                               id='semantics_test_' + label,
              +                                               step_id='test_sources')
                               node_test.inputs['semantics'] =\
                                   self.sources_semantics_test[label].output
               
              @@ -1187,7 +1236,8 @@ 

              Source code for WORC.WORC

                           self.network.create_node('worc/FeatureConverter:1.0',
                                                    tool_version='1.0',
                                                    id='featureconverter_train_' + node_ID,
              -                                     resources=ResourceLimit(memory='4G'))
              +                                     resources=ResourceLimit(memory='4G'),
              +                                     step_id='Feature_Extraction')
               
                       conv_train.inputs['feat_in'] = node_train.outputs['features']
               
              @@ -1197,12 +1247,13 @@ 

              Source code for WORC.WORC

                       elif 'predict' in calcfeat_node.lower():
                           toolbox = 'PREDICT'
                       else:
            message = f'Toolbox {calcfeat_node} not recognized!'
                           raise WORCexceptions.WORCKeyError(message)
               
                       self.source_toolbox_name[label] =\
                           self.network.create_constant('String', toolbox,
              -                                         id=f'toolbox_name_{toolbox}_{label}')
              +                                         id=f'toolbox_name_{toolbox}_{label}',
              +                                         step_id='Feature_Extraction')
               
                       conv_train.inputs['toolbox'] = self.source_toolbox_name[label].output
                       conv_train.inputs['config'] = self.sources_parameters[label].output
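The toolbox dispatch in this hunk boils down to a name check on the calculator node; a standalone sketch (the 'PyRadiomics' value for the pyradiomics branch is assumed, as that branch lies outside this hunk):

    def detect_toolbox(calcfeat_node):
        if 'pyradiomics' in calcfeat_node.lower():
            return 'PyRadiomics'  # assumed; branch not shown in this hunk
        elif 'predict' in calcfeat_node.lower():
            return 'PREDICT'
        raise KeyError(f'Toolbox {calcfeat_node} not recognized!')

    print(detect_toolbox('predict/CalcFeatures:1.0'))  # PREDICT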
              @@ -1212,7 +1263,8 @@ 

              Source code for WORC.WORC

                               self.network.create_node('worc/FeatureConverter:1.0',
                                                        tool_version='1.0',
                                                        id='featureconverter_test_' + node_ID,
              -                                         resources=ResourceLimit(memory='4G'))
              +                                         resources=ResourceLimit(memory='4G'),
              +                                         step_id='Feature_Extraction')
               
                           conv_test.inputs['feat_in'] = node_test.outputs['features']
                           conv_test.inputs['toolbox'] = self.source_toolbox_name[label].output
              @@ -1260,13 +1312,15 @@ 

              Source code for WORC.WORC

                           self.sources_segmentations_train[label] =\
                               self.network.create_source('ITKImageFile',
                                                          id='segmentations_train_' + label,
              -                                           node_group='input')
              +                                           node_group='input',
              +                                           step_id='train_sources')
               
                           self.converters_seg_train[label] =\
                               self.network.create_node('worc/WORCCastConvert:0.3.2',
                                                        tool_version='0.1',
                                                        id='convert_seg_train_' + label,
              -                                         resources=ResourceLimit(memory=memory))
              +                                         resources=ResourceLimit(memory=memory),
              +                                         step_id='FileConversion')
               
                           self.converters_seg_train[label].inputs['image'] =\
                               self.sources_segmentations_train[label].output
@@ -1275,13 +1329,15 @@ Source code for WORC.WORC
                               self.sources_segmentations_test[label] =\
                                   self.network.create_source('ITKImageFile',
                                                              id='segmentations_test_' + label,
              -                                               node_group='input')
              +                                               node_group='input',
              +                                               step_id='test_sources')
               
                               self.converters_seg_test[label] =\
                                   self.network.create_node('worc/WORCCastConvert:0.3.2',
                                                            tool_version='0.1',
                                                            id='convert_seg_test_' + label,
              -                                             resources=ResourceLimit(memory=memory))
              +                                             resources=ResourceLimit(memory=memory),
              +                                             step_id='FileConversion')
               
                               self.converters_seg_test[label].inputs['image'] =\
                                   self.sources_segmentations_test[label].output
@@ -1301,39 +1357,45 @@ Source code for WORC.WORC
                               self.network.create_node(elastix_node,
                                                        tool_version='0.2',
                                                        id='elastix_train_' + label,
              -                                         resources=ResourceLimit(memory=memory_elastix))
              +                                         resources=ResourceLimit(memory=memory_elastix),
              +                                         step_id='Image_Registration')
               
                           memory_transformix = self.fastr_memory_parameters['Elastix']
                           self.transformix_seg_nodes_train[label] =\
                               self.network.create_node(transformix_node,
                                                        tool_version='0.2',
                                                        id='transformix_seg_train_' + label,
              -                                         resources=ResourceLimit(memory=memory_transformix))
              +                                         resources=ResourceLimit(memory=memory_transformix),
              +                                         step_id='Image_Registration')
               
                           self.transformix_im_nodes_train[label] =\
                               self.network.create_node(transformix_node,
                                                        tool_version='0.2',
                                                        id='transformix_im_train_' + label,
              -                                         resources=ResourceLimit(memory=memory_transformix))
              +                                         resources=ResourceLimit(memory=memory_transformix),
              +                                         step_id='Image_Registration')
               
                           if self.TrainTest:
                               self.elastix_nodes_test[label] =\
                                   self.network.create_node(elastix_node,
                                                            tool_version='0.2',
                                                            id='elastix_test_' + label,
              -                                             resources=ResourceLimit(memory=memory_elastix))
              +                                             resources=ResourceLimit(memory=memory_elastix),
              +                                             step_id='Image_Registration')
               
                               self.transformix_seg_nodes_test[label] =\
                                   self.network.create_node(transformix_node,
                                                            tool_version='0.2',
                                                            id='transformix_seg_test_' + label,
              -                                             resources=ResourceLimit(memory=memory_transformix))
              +                                             resources=ResourceLimit(memory=memory_transformix),
              +                                             step_id='Image_Registration')
               
                               self.transformix_im_nodes_test[label] =\
                                   self.network.create_node(transformix_node,
                                                            tool_version='0.2',
                                                            id='transformix_im_test_' + label,
              -                                             resources=ResourceLimit(memory=memory_transformix))
              +                                             resources=ResourceLimit(memory=memory_transformix),
              +                                             step_id='Image_Registration')
               
                           # Create sources_segmentation
                           # M1 = moving, others = fixed
@@ -1355,7 +1417,8 @@ Source code for WORC.WORC
                                   self.copymetadata_nodes_train[self.modlabels[0]] =\
                                       self.network.create_node('itktools/0.3.2/CopyMetadata:1.0',
                                                                tool_version='1.0',
              -                                                 id='CopyMetadata_train_' + self.modlabels[0])
              +                                                 id='CopyMetadata_train_' + self.modlabels[0],
              +                                                 step_id='Image_Registration')
               
                                   self.copymetadata_nodes_train[self.modlabels[0]].inputs["source"] =\
                                       self.converters_im_train[self.modlabels[0]].outputs['image']
@@ -1385,7 +1448,8 @@ Source code for WORC.WORC
                                       self.copymetadata_nodes_test[self.modlabels[0]] =\
                                           self.network.create_node('itktools/0.3.2/CopyMetadata:1.0',
                                                                    tool_version='1.0',
              -                                                     id='CopyMetadata_test_' + self.modlabels[0])
              +                                                     id='CopyMetadata_test_' + self.modlabels[0],
              +                                                     step_id='Image_Registration')
               
                                       self.copymetadata_nodes_test[self.modlabels[0]].inputs["source"] =\
                                           self.converters_im_test[self.modlabels[0]].outputs['image']
@@ -1403,7 +1467,8 @@ Source code for WORC.WORC
                           self.source_Elastix_Parameters[label] =\
                               self.network.create_source('ElastixParameterFile',
                                                          id='Elastix_Para_' + label,
              -                                           node_group='elpara')
              +                                           node_group='elpara',
              +                                           step_id='Image_Registration')
               
                           self.link_elparam_train =\
                               self.network.create_link(self.source_Elastix_Parameters[label].output,
@@ -1436,7 +1501,8 @@ Source code for WORC.WORC
                           self.edittransformfile_nodes_train[label] =\
                               self.network.create_node('elastixtools/EditElastixTransformFile:0.1',
                                                        tool_version='0.1',
              -                                         id='EditElastixTransformFile' + label)
              +                                         id='EditElastixTransformFile_train_' + label,
              +                                         step_id='Image_Registration')
               
                           self.edittransformfile_nodes_train[label].inputs['set'] =\
                               ["FinalBSplineInterpolationOrder=0"]
@@ -1448,7 +1514,8 @@ Source code for WORC.WORC
                               self.edittransformfile_nodes_test[label] =\
                                   self.network.create_node('elastixtools/EditElastixTransformFile:0.1',
                                                            tool_version='0.1',
              -                                             id='EditElastixTransformFile' + label)
              +                                             id='EditElastixTransformFile_test_' + label,
              +                                             step_id='Image_Registration')
               
                               self.edittransformfile_nodes_test[label].inputs['set'] =\
                                   ["FinalBSplineInterpolationOrder=0"]
@@ -1476,25 +1543,30 @@ Source code for WORC.WORC
                               self.transformix_im_nodes_test[label].inputs['image'] =\
                                   self.converters_im_test[self.modlabels[0]].outputs['image']
               
              -            for i_node in range(len(self.calcfeatures_train[label])):
              -                self.calcfeatures_train[label][i_node].inputs['segmentation'] =\
              -                    self.transformix_seg_nodes_train[label].outputs['image']
              -                if self.TrainTest:
              -                    self.calcfeatures_test[label][i_node].inputs['segmentation'] =\
              -                        self.transformix_seg_nodes_test[label].outputs['image']
              +            if self.configs[nmod]['General']['Segmentix'] != 'True':
              +                # These segmentations serve as input for the feature calculation
              +                for i_node in range(len(self.calcfeatures_train[label])):
              +                    self.calcfeatures_train[label][i_node].inputs['segmentation'] =\
              +                        self.transformix_seg_nodes_train[label].outputs['image']
              +                    if self.TrainTest:
              +                        self.calcfeatures_test[label][i_node].inputs['segmentation'] =\
              +                            self.transformix_seg_nodes_test[label].outputs['image']
               
             # Save output for the training set
                           self.sinks_transformations_train[label] =\
                               self.network.create_sink('ElastixTransformFile',
              -                                         id='transformations_train_' + label)
              +                                         id='transformations_train_' + label,
              +                                         step_id='train_sinks')
               
                           self.sinks_segmentations_elastix_train[label] =\
                               self.network.create_sink('ITKImageFile',
              -                                         id='segmentations_out_elastix_train_' + label)
              +                                         id='segmentations_out_elastix_train_' + label,
              +                                         step_id='train_sinks')
               
                           self.sinks_images_elastix_train[label] =\
                               self.network.create_sink('ITKImageFile',
              -                                         id='images_out_elastix_train_' + label)
              +                                         id='images_out_elastix_train_' + label,
              +                                         step_id='train_sinks')
               
                           self.sinks_transformations_train[label].input =\
                               self.elastix_nodes_train[label].outputs['transform']
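The guard added above wires the transformed segmentations straight into feature calculation only when segmentix is disabled; note that it compares against the string 'True' because WORC configs are parsed with configparser, which returns raw strings unless a typed getter is used. A self-contained illustration of that detail:

```python
# Why the guard compares against the string 'True': configparser values
# are strings unless accessed with getboolean(). Runnable illustration.
import configparser

config = configparser.ConfigParser()
config.read_string('[General]\nSegmentix = True\n')

print(config['General']['Segmentix'] != 'True')   # False: segmentix is enabled
print(config['General'].getboolean('Segmentix'))  # True, typed access
```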
@@ -1509,14 +1581,18 @@ Source code for WORC.WORC
                           if self.TrainTest:
                               self.sinks_transformations_test[label] =\
                                   self.network.create_sink('ElastixTransformFile',
              -                                             id='transformations_test_' + label)
              +                                             id='transformations_test_' + label,
              +                                             step_id='test_sinks')
               
                               self.sinks_segmentations_elastix_test[label] =\
                                   self.network.create_sink('ITKImageFile',
              -                                             id='segmentations_out_elastix_test_' + label)
              +                                             id='segmentations_out_elastix_test_' + label,
              +                                             step_id='test_sinks')
                               self.sinks_images_elastix_test[label] =\
              -                    self.network.create_sink('ITKImageFile', id='images_out_elastix_test_' + label)
              -                self.sinks_transformations_elastix_test[label].input =\
              +                    self.network.create_sink('ITKImageFile',
              +                                             id='images_out_elastix_test_' + label,
              +                                             step_id='test_sinks')
              +                self.sinks_transformations_test[label].input =\
                                   self.elastix_nodes_test[label].outputs['transform']
                               self.sinks_segmentations_elastix_test[label].input =\
                                   self.transformix_seg_nodes_test[label].outputs['image']
@@ -1531,14 +1607,16 @@ Source code for WORC.WORC
                       if label not in self.sinks_segmentations_segmentix_train:
                           self.sinks_segmentations_segmentix_train[label] =\
                               self.network.create_sink('ITKImageFile',
              -                                         id='segmentations_out_segmentix_train_' + label)
              +                                         id='segmentations_out_segmentix_train_' + label,
              +                                         step_id='train_sinks')
               
                       memory = self.fastr_memory_parameters['Segmentix']
                       self.nodes_segmentix_train[label] =\
                           self.network.create_node('segmentix/Segmentix:1.0',
                                                    tool_version='1.0',
                                                    id='segmentix_train_' + label,
              -                                     resources=ResourceLimit(memory=memory))
              +                                     resources=ResourceLimit(memory=memory),
              +                                     step_id='Preprocessing')
               
                       # Input the image
                       self.nodes_segmentix_train[label].inputs['image'] =\
@@ -1568,11 +1646,15 @@ Source code for WORC.WORC
                       if self.TrainTest:
                           self.sinks_segmentations_segmentix_test[label] =\
                               self.network.create_sink('ITKImageFile',
              -                                         id='segmentations_out_segmentix_test_' + label)
              +                                         id='segmentations_out_segmentix_test_' + label,
              +                                         step_id='test_sinks')
              +
                           self.nodes_segmentix_test[label] =\
                               self.network.create_node('segmentix/Segmentix:1.0',
                                                        tool_version='1.0',
              -                                         id='segmentix_test_' + label, resources=ResourceLimit(memory=memory))
              +                                         id='segmentix_test_' + label,
              +                                         resources=ResourceLimit(memory=memory),
              +                                         step_id='Preprocessing')
               
                           self.nodes_segmentix_test[label].inputs['image'] =\
                               self.converters_im_test[label].outputs['image']
@@ -1730,14 +1812,16 @@ Source code for WORC.WORC
                           self.network.draw(file_path=self.network.id + '.svg', draw_dimensions=True)
                       except graphviz.backend.ExecutableNotFound:
                           print('[WORC WARNING] Graphviz executable not found: not drawing network diagram. Make sure the Graphviz executables are on your systems PATH.')
              +        except graphviz.backend.CalledProcessError as e:
              +            print(f'[WORC WARNING] Graphviz executable gave an error: not drawing network diagram. Original error: {e}')
               
                       if DebugDetector().do_detection():
                           print("Source Data:")
                           for k in self.source_data.keys():
              -                print(f"\t {k}: {self.source_data[k]}.")
              +                print(f"\t {k}: {self.source_data[k]}.")
                           print("\n Sink Data:")
                           for k in self.sink_data.keys():
              -                print(f"\t {k}: {self.sink_data[k]}.")
              +                print(f"\t {k}: {self.sink_data[k]}.")
               
                           # When debugging, set the tempdir to the default of fastr + name
                           self.fastr_tmpdir = os.path.join(fastr.config.mounts['tmp'],
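The except clause added in the hunk above covers the case where the Graphviz executable is present but fails while rendering, complementing the existing ExecutableNotFound handler. The resulting defensive pattern as a standalone sketch; the network id is hypothetical and drawing an empty network is only for illustration:

```python
# Defensive network drawing, mirroring the two exceptions caught in this diff.
import fastr
import graphviz

network = fastr.create_network(id='draw_demo')  # hypothetical empty network
try:
    network.draw(file_path=network.id + '.svg', draw_dimensions=True)
except graphviz.backend.ExecutableNotFound:
    print('[WORC WARNING] Graphviz executable not found: not drawing network diagram.')
except graphviz.backend.CalledProcessError as e:
    print(f'[WORC WARNING] Graphviz gave an error: not drawing network diagram. Original error: {e}.')
```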
@@ -1774,7 +1858,7 @@ Source code for WORC.WORC
                               config = configparser.ConfigParser()
                               config.read(c)
                               c = config
              -            cfile = os.path.join(self.fastr_tmpdir, f"config_{self.name}_{num}.ini")
              +            cfile = os.path.join(self.fastr_tmpdir, f"config_{self.name}_{num}.ini")
                           if not os.path.exists(os.path.dirname(cfile)):
                               os.makedirs(os.path.dirname(cfile))
                           with open(cfile, 'w') as configfile:
@@ -1782,15 +1866,15 @@ Source code for WORC.WORC
               
                           # If PyRadiomics is used, also write a config for PyRadiomics
                           if 'pyradiomics' in c['General']['FeatureCalculators']:
              -                cfile_pyradiomics = os.path.join(self.fastr_tmpdir, f"config_pyradiomics_{self.name}_{num}.yaml")
              +                cfile_pyradiomics = os.path.join(self.fastr_tmpdir, f"config_pyradiomics_{self.name}_{num}.yaml")
                               config_pyradiomics = io.convert_config_pyradiomics(c)
                               with open(cfile_pyradiomics, 'w') as file:
                                   yaml.safe_dump(config_pyradiomics, file)
              -                cfile_pyradiomics = Path(self.fastr_tmpdir) / f"config_pyradiomics_{self.name}_{num}.yaml"
              +                cfile_pyradiomics = Path(self.fastr_tmpdir) / f"config_pyradiomics_{self.name}_{num}.yaml"
                               self.pyradiomics_configs.append(cfile_pyradiomics.as_uri().replace('%20', ' '))
               
                           # BUG: Make path with pathlib to create windows double slashes
              -            cfile = Path(self.fastr_tmpdir) / f"config_{self.name}_{num}.ini"
              +            cfile = Path(self.fastr_tmpdir) / f"config_{self.name}_{num}.ini"
                           self.fastrconfigs.append(cfile.as_uri().replace('%20', ' '))
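The pathlib round-trip above exists because Path.as_uri() yields a well-formed file:// URI on both POSIX and Windows (working around the double-backslash issue noted in the BUG comment), after which the percent-encoded spaces are restored because the surrounding code expects literal spaces in the URI. A small sketch with a hypothetical path:

```python
# Sketch of the URI construction used for the fastr config sources.
from pathlib import Path

cfile = Path('/tmp/WORC_tmp') / 'config_example_0.ini'  # hypothetical path
uri = cfile.as_uri().replace('%20', ' ')                # restore encoded spaces
print(uri)  # file:///tmp/WORC_tmp/config_example_0.ini
```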
@@ -1804,7 +1888,7 @@ Source code for WORC.WORC
                   3. Slicer pipeline, to create pngs of middle slice of images.
                   """
               
-    def __init__(self):
+    def __init__(self):
         """Initialize object with all pipelines."""
         self.Elastix = Elastix()
         self.Evaluate = Evaluate()

@@ -176,18 +183,18 @@ Source code for WORC.classification.AdvancedSampler
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from sklearn.utils import check_random_state
 import numpy as np
-from sklearn.externals import six
+import six
 from ghalton import Halton
 # from sobol_seq import i4_sobol_generate as Sobol
 import scipy
 from scipy.stats import uniform
 import math

 class log_uniform():
     def __init__(self, loc=-1, scale=0, base=10):
         self.loc = loc
         self.scale = scale
         self.base = base
@@ -201,7 +208,7 @@ Source code for WORC.classification.AdvancedSampler
 class discrete_uniform():
     def __init__(self, loc=-1, scale=0):
         self.loc = loc
         self.scale = scale
         self.uniform_dist = uniform(loc=self.loc, scale=self.scale)
@@ -221,7 +228,7 @@ Source code for WORC.classification.AdvancedSampler
     object returns strings.
     '''
     def __init__(self, loc=0, scale=1, threshold=0.5):
         self.loc = loc
         self.scale = scale
         self.threshold = threshold
@@ -235,7 +242,7 @@ Source code for WORC.classification.AdvancedSampler
 class exp_uniform():
     def __init__(self, loc=-1, scale=0, base=math.e):
         self.loc = loc
         self.scale = scale
         self.base = base
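These helper classes wrap scipy distributions so hyperparameters can be drawn on transformed scales: log_uniform, for instance, draws u ~ U(loc, loc + scale) and returns base ** u, spreading samples uniformly on a log scale. A standalone sketch of that idea, not the WORC class itself:

```python
# Log-uniform sampling sketched by hand: u ~ U(loc, loc + scale), value = base ** u.
from scipy.stats import uniform

loc, scale, base = -3, 2, 10
u = uniform(loc=loc, scale=scale).rvs(random_state=42)
value = base ** u   # lies between 10**-3 and 10**-1
print(value)
```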
@@ -306,7 +313,7 @@ Source code for WORC.classification.AdvancedSampler
     ... {'b': 1.038159, 'a': 2}]
     True
     """
     def __init__(self, param_distributions, n_iter, random_state=None,
                  method='Halton'):
         self.param_distributions = param_distributions
         self.n_iter = n_iter
@@ -316,7 +323,7 @@ Source code for WORC.classification.AdvancedSampler
         if method == 'Halton':
             self.Halton = Halton(len(self.param_distributions.keys()))

     def __iter__(self):
         # Create a random state to be used
         rnd = check_random_state(self.random_state)
@@ -349,7 +356,7 @@ Source code for WORC.classification.AdvancedSampler
         if self.method == 'Halton':
             self.Halton.reset()

     def __len__(self):
         """Number of points that will be sampled."""
         return self.n_iter
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/ObjectSampler.html b/WORC/doc/_build/html/_modules/WORC/classification/ObjectSampler.html
index 48c077f1..6123a469 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/ObjectSampler.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/ObjectSampler.html
@@ -1,23 +1,29 @@
-    WORC.classification.ObjectSampler — WORC 3.1.3 documentation
+    WORC.classification.ObjectSampler — WORC 3.3.0 documentation
@@ -161,7 +168,7 @@ Source code for WORC.classification.ObjectSampler
                   #!/usr/bin/env python
                   
                  -# Copyright 2016-2019 Biomedical Imaging Group Rotterdam, Departments of
                  +# Copyright 2016-2020 Biomedical Imaging Group Rotterdam, Departments of
                   # Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
                   #
                   # Licensed under the Apache License, Version 2.0 (the "License");
@@ -191,79 +198,159 @@ Source code for WORC.classification.ObjectSampler
     He, Haibo, and Edwardo A. Garcia. "Learning from imbalanced data."
     IEEE Transactions on Knowledge & Data Engineering 9 (2008): 1263-1284.
     """

     def __init__(self, method, sampling_strategy='auto',
-                 SMOTE_ratio=1,
-                 SMOTE_neighbors=5,
                  n_jobs=1, n_neighbors=3,
-                 ):
-
+                 k_neighbors=5,
+                 threshold_cleaning=0.5,
+                 verbose=True):
+        """Initialize object."""
         # Initialize a random state
         self.random_seed = np.random.randint(5000)
-        self.random_state = check_random_state(random_seed)
+        self.random_state = check_random_state(self.random_seed)

         # Initialize all objects as Nones: overridden when required by functions
-        self.sampling_strategy = None
         self.object = None
-        self.n_neighbors = None
+        self.sampling_strategy = None
         self.n_jobs = None
+        self.n_neighbors = None
+        self.k_neighbors = None
+        self.threshold_cleaning = None
+        self.verbose = verbose

         if method == 'RandomUnderSampling':
             self.init_RandomUnderSampling(sampling_strategy)
         elif method == 'NearMiss':
-            self.init_NearMiss(sampling_strategy, n_neighbors, n_jobs)
-        elif method == 'NeigbourhoodCleaningRule':
-            self.init_NeigbourhoodCleaningRule()
+            self.init_NearMiss(sampling_strategy, n_jobs)
+        elif method == 'NeighbourhoodCleaningRule':
+            self.init_NeighbourhoodCleaningRule(sampling_strategy, n_neighbors,
+                                                n_jobs, threshold_cleaning)
         elif method == 'RandomOverSampling':
             self.init_RandomOverSampling(sampling_strategy)
         elif method == 'ADASYN':
-            self.init_ADASYN()
+            self.init_ADASYN(sampling_strategy, n_neighbors, n_jobs)
         elif method == 'BorderlineSMOTE':
-            self.init_BorderlineSMOTE()
+            self.init_BorderlineSMOTE(k_neighbors, n_jobs)
+        elif method == 'SMOTE':
+            self.init_SMOTE(k_neighbors, n_jobs)
         elif method == 'SMOTEENN':
-            self.init_SMOTEENN()
+            self.init_SMOTEENN(sampling_strategy)
         elif method == 'SMOTETomek':
-            self.init_SMOTETomek()
+            self.init_SMOTETomek(sampling_strategy)
         else:
             raise ae.WORCKeyError(f'{method} is not a valid sampling method!')

     def init_RandomUnderSampling(self, sampling_strategy):
+        """Create a random under sampler object."""
         self.object = under_sampling.RandomUnderSampler(sampling_strategy=sampling_strategy,
                                                         random_state=self.random_state)
         self.sampling_strategy = sampling_strategy

-    def init_NearMiss(self, sampling_strategy, n_neighbors, n_jobs):
+    def init_NearMiss(self, sampling_strategy, n_jobs):
+        """Create a near miss sampler object."""
         self.object = under_sampling.NearMiss(sampling_strategy=sampling_strategy,
-                                              random_state=self.random_state,
-                                              n_neighbors=n_neighbors,
                                               n_jobs=n_jobs)

         self.sampling_strategy = sampling_strategy
-        self.n_neighbors = n_neighbors
         self.n_jobs = n_jobs

+    def init_NeighbourhoodCleaningRule(self, sampling_strategy, n_neighbors,
+                                       n_jobs, threshold_cleaning):
+        """Create a NeighbourhoodCleaningRule sampler object."""
+        self.object =\
+            under_sampling.NeighbourhoodCleaningRule(sampling_strategy=sampling_strategy,
+                                                     threshold_cleaning=threshold_cleaning,
+                                                     n_jobs=n_jobs)
+
+        self.sampling_strategy = sampling_strategy
+        self.n_neighbors = n_neighbors
+        self.n_jobs = n_jobs
+        self.threshold_cleaning = threshold_cleaning

     def init_RandomOverSampling(self, sampling_strategy):
+        """Create a random over sampler object."""
         self.object = over_sampling.RandomOverSampler(sampling_strategy=sampling_strategy,
                                                       random_state=self.random_state)
         self.sampling_strategy = sampling_strategy

-    def init_SMOTE(self):
-        sm = SMOTE(random_state=None,
-                   ratio=para_estimator['SampleProcessing_SMOTE_ratio'],
-                   m_neighbors=para_estimator['SampleProcessing_SMOTE_neighbors'],
-                   kind='borderline1',
-                   n_jobs=para_estimator['SampleProcessing_SMOTE_n_cores'])
+    def init_ADASYN(self, sampling_strategy, n_neighbors, n_jobs):
+        """Create an ADASYN sampler object."""
+        self.object = over_sampling.ADASYN(sampling_strategy=sampling_strategy,
+                                           random_state=self.random_state,
+                                           n_neighbors=n_neighbors,
+                                           n_jobs=n_jobs)

-        self.object = sm
+        self.sampling_strategy = sampling_strategy
+        self.n_neighbors = n_neighbors
+        self.n_jobs = n_jobs
+
+    def init_BorderlineSMOTE(self, k_neighbors, n_jobs):
+        """Create a BorderlineSMOTE sampler object."""
+        self.object =\
+            over_sampling.BorderlineSMOTE(random_state=self.random_state,
+                                          k_neighbors=k_neighbors,
+                                          n_jobs=n_jobs)
+
+        self.k_neighbors = k_neighbors
+        self.n_jobs = n_jobs
+
+    def init_SMOTE(self, k_neighbors, n_jobs):
+        """Create a SMOTE sampler object."""
+        self.object =\
+            over_sampling.SMOTE(random_state=self.random_state,
+                                k_neighbors=k_neighbors,
+                                n_jobs=n_jobs)
+
+        self.k_neighbors = k_neighbors
+        self.n_jobs = n_jobs
+
+    def init_SMOTEENN(self, sampling_strategy):
+        """Create a SMOTEENN sampler object."""
+        self.object =\
+            combine.SMOTEENN(random_state=self.random_state,
+                             sampling_strategy=sampling_strategy)
+
+        self.sampling_strategy = sampling_strategy

-    def fit(self, **kwargs):
-        self.object.fit(**kwargs)
+    def init_SMOTETomek(self, sampling_strategy):
+        """Create a SMOTE Tomek sampler object."""
+        self.object =\
+            combine.SMOTETomek(random_state=self.random_state,
+                               sampling_strategy=sampling_strategy)
+
+        self.sampling_strategy = sampling_strategy
+
+    def fit(self, *args, **kwargs):
+        """Fit a sampler object."""
+        if hasattr(self.object, 'fit_resample'):
+            if self.verbose:
+                print('[WORC WARNING] Sampler has a fit_resample construction: not fitting now.')
+        else:
+            # Object has a fit-transform construction
+            self.object.fit(*args, **kwargs)
+
+    def transform(self, *args, **kwargs):
+        """Transform objects with a fitted sampler."""
+        if hasattr(self.object, 'fit_resample'):
+            if self.verbose:
+                print('[WORC WARNING] Sampler has a fit_resample construction: fitting and resampling now.')
+            try:
+                return self.object.fit_resample(*args, **kwargs)
+            except ValueError as message:
+                message = str(message)
+                message = 'The ObjectSampler could not ' +\
+                          'resample the objects with ' +\
+                          'the given parameters. ' +\
+                          'Probably your number of samples ' +\
+                          'is too small for the parameters ' +\
+                          'used. Original error: ' + message
+                raise ae.WORCValueError(message)
+        else:
+            return self.object.transform(*args, **kwargs)
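Taken together, the rewritten class is a thin facade over imbalanced-learn: fit() is a no-op for samplers that only offer fit_resample, and transform() performs the combined fit-and-resample. A hedged usage sketch, assuming the import path below and a small imbalanced toy set:

```python
# Usage sketch of the ObjectSampler facade; the import path is an assumption.
import numpy as np
from WORC.classification.ObjectSampler import ObjectSampler

X = np.random.rand(30, 5)
y = np.array([0] * 25 + [1] * 5)

sampler = ObjectSampler(method='RandomOverSampling', sampling_strategy='auto')
sampler.fit(X, y)                # no-op here: the sampler has fit_resample
X_res, y_res = sampler.transform(X, y)
print(np.bincount(y_res))        # classes balanced after resampling
```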
@@ -177,11 +184,11 @@ Source code for WORC.classification.RankedSVM
 # limitations under the License.
 from __future__ import division

 import numpy as np
 from scipy.optimize import linprog
 from scipy.optimize import fminbound
 from scipy import linalg
 import operator

 import WORC.addexceptions as WORCexceptions

@@ -177,47 +184,53 @@ Source code for WORC.classification.SearchCV
                    # limitations under the License.
                     
                     
                    -from sklearn.base import BaseEstimator, is_classifier, clone
                    -from sklearn.base import MetaEstimatorMixin
                    -from sklearn.exceptions import NotFittedError
                    -from sklearn.utils.metaestimators import if_delegate_has_method
                    -from sklearn.utils.validation import indexable, check_is_fitted
                    -from WORC.classification.metrics import check_scoring
                    -from sklearn.model_selection._split import check_cv
                    -from scipy.stats import rankdata
                    -from sklearn.externals import six
                    -from sklearn.utils.fixes import MaskedArray
                    +from sklearn.base import BaseEstimator, is_classifier, clone
                    +from sklearn.base import MetaEstimatorMixin
                    +from sklearn.exceptions import NotFittedError
                    +from sklearn.utils.metaestimators import if_delegate_has_method
                    +from sklearn.utils.validation import indexable, check_is_fitted
                    +from WORC.classification.metrics import check_scoring
                    +from sklearn.model_selection._split import check_cv
                    +from scipy.stats import rankdata
                    +import six
                    +from sklearn.utils.fixes import MaskedArray
                     
                    -from sklearn.model_selection._search import ParameterSampler
                    -from sklearn.model_selection._search import ParameterGrid, _check_param_grid
                    -from sklearn.preprocessing import StandardScaler
                    +from sklearn.model_selection._search import ParameterSampler
                    +from sklearn.model_selection._search import ParameterGrid, _check_param_grid
                    +from sklearn.preprocessing import StandardScaler
                     
                    -from abc import ABCMeta, abstractmethod
                    -from collections import Sized, defaultdict
                    +from abc import ABCMeta, abstractmethod
                    +from collections import Sized, defaultdict
                     import numpy as np
                    -from functools import partial
                    +from functools import partial
                     import warnings
                     
                     import os
                     import random
                     import string
                     import fastr
                    -from fastr.api import ResourceLimit
                    -from joblib import Parallel, delayed
                    -from WORC.classification.fitandscore import fit_and_score, replacenan
                    -from WORC.classification.fitandscore import delete_nonestimator_parameters
                    +from fastr.api import ResourceLimit
                    +from joblib import Parallel, delayed
                    +from WORC.classification.fitandscore import fit_and_score, replacenan
                    +from WORC.classification.fitandscore import delete_nonestimator_parameters
                    +from sklearn.utils.validation import _check_fit_params
                    +from sklearn.utils.validation import _num_samples
                    +from sklearn.model_selection._validation import _aggregate_score_dicts
                    +from WORC.classification.metrics import check_multimetric_scoring
                    +from sklearn.metrics._scorer import _MultimetricScorer
                     import WORC.addexceptions as WORCexceptions
                     import pandas as pd
                     import json
                     import glob
                    -from itertools import islice
                    +from itertools import islice
                     import shutil
                    -from sklearn.metrics import f1_score, roc_auc_score, mean_squared_error
                    -from sklearn.metrics import accuracy_score
                    -from sklearn.multiclass import OneVsRestClassifier
                    -from WORC.classification.estimators import RankedSVM
                    -from WORC.classification import construct_classifier as cc
                    -from WORC.featureprocessing.Preprocessor import Preprocessor
                    +from sklearn.metrics import f1_score, roc_auc_score, mean_squared_error
                    +from sklearn.metrics import accuracy_score
                    +from sklearn.multiclass import OneVsRestClassifier
                    +from WORC.classification.estimators import RankedSVM
                    +from WORC.classification import construct_classifier as cc
                    +from WORC.featureprocessing.Preprocessor import Preprocessor
                    +from WORC.detectors.detectors import DebugDetector
                     
                     
                     
 def rms_score(truth, prediction):
@@ -259,7 +272,7 @@ Source code for WORC.classification.SearchCV
                    MetaEstimatorMixin)):
                         """Ensemble of BaseSearchCV Estimators."""
                         # @abstractmethod
     def __init__(self, estimators):
         if not estimators:
             message = 'You supplied an empty list of estimators: No ensemble creation possible.'
             raise WORCexceptions.WORCValueError(message)
@@ -515,7 +528,7 @@ Source code for WORC.classification.SearchCV
                    MetaEstimatorMixin)):
                         """Base class for hyper parameter search with cross-validation."""
                     
     @abstractmethod
     def __init__(self, param_distributions={}, n_iter=10, scoring=None,
                  fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
                  verbose=0, pre_dispatch='2*n_jobs', random_state=None,
                  error_score='raise', return_train_score=True,
@@ -727,10 +740,19 @@ Source code for WORC.classification.SearchCV
         return self.best_estimator_.transform(Xt)

     def preprocess(self, X, y=None, training=False):
-        '''Apply the available preprocssing methods to the features'''
+        """Apply the available preprocessing methods to the features."""
         if self.best_preprocessor is not None:
             X = self.best_preprocessor.transform(X)

+        if self.best_imputer is not None:
+            X = self.best_imputer.transform(X)
+
+        # Replace nan if still left
+        X = replacenan(np.asarray(X)).tolist()
+
+        if self.best_groupsel is not None:
+            X = self.best_groupsel.transform(X)
+
         if not training and hasattr(self, 'overfit_scaler') and self.overfit_scaler:
             # Overfit the feature scaling on the test set
             # NOTE: Never use this in an actual model, only to assess how
@@ -747,15 +769,6 @@ Source code for WORC.classification.SearchCV
                    if self.best_scaler is not None:
                                     X = self.best_scaler.transform(X)
                     
                    -        if self.best_imputer is not None:
                    -            X = self.best_imputer.transform(X)
                    -
                    -        # Replace nan if still left
                    -        X = replacenan(np.asarray(X)).tolist()
                    -
                    -        if self.best_groupsel is not None:
                    -            X = self.best_groupsel.transform(X)
                    -
                             if self.best_varsel is not None:
                                 X = self.best_varsel.transform(X)
                     
@@ -771,40 +784,34 @@ Source code for WORC.classification.SearchCV
                    if self.best_statisticalsel is not None:
                                 X = self.best_statisticalsel.transform(X)
                     
                    -        # Only oversample in training phase, i.e. if we have the labels
+        # Only resample in training phase, i.e. if we have the labels
                             if y is not None:
                    -            if self.best_SMOTE is not None:
                    -                X, y = self.best_SMOTE.fit_sample(X, y)
                    -
                    -            if self.best_RandomOverSampler is not None:
                    -                X, y = self.best_RandomOverSampler.fit_sample(X, y)
                    +            if self.best_Sampler is not None:
                    +                X, y = self.best_Sampler.transform(X, y)
                     
                             return X, y
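Net effect of the moves above: imputation, NaN replacement, and group selection now run before scaling rather than after it, and the per-method samplers (best_SMOTE, best_RandomOverSampler) are replaced by a single best_Sampler applied only when labels are available, i.e. in the training phase. A simplified standalone sketch of the resulting order; WORC's real chain also includes group, variance, model-based, and statistical selection steps, and replacenan is approximated here with numpy:

```python
# Simplified sketch of the rewritten preprocess() chain; the samplers and
# transformers stand in for WORC's fitted best_* attributes.
import numpy as np

def preprocess(X, y=None, imputer=None, scaler=None, sampler=None):
    if imputer is not None:
        X = imputer.transform(X)
    X = np.nan_to_num(np.asarray(X))     # stand-in for WORC's replacenan
    if scaler is not None:
        X = scaler.transform(X)
    if y is not None and sampler is not None:
        X, y = sampler.transform(X, y)   # ObjectSampler-style API, training only
    return X, y
```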
-    @property
-    def best_params_(self):
-        check_is_fitted(self, 'cv_results_')
-        return self.cv_results_['params_all'][self.best_index_]
-
-    @property
-    def best_score_(self):
-        check_is_fitted(self, 'cv_results_')
-        return self.cv_results_['mean_test_score'][self.best_index_]
-
-    def process_fit(self, n_splits, parameters_est, parameters_all,
-                    test_sample_counts, test_scores,
-                    train_scores, fit_time, score_time, cv_iter,
+    def process_fit(self, n_splits, parameters_all,
+                    test_sample_counts, test_score_dicts,
+                    train_score_dicts, fit_time, score_time, cv_iter,
                     X, y):
         """
         Process the outcomes of a SearchCV fit and find the best settings
         over all cross validations from all hyperparameters tested
+
+        Very similar to the _format_results function of the original SearchCV.
+        """
+        # test_score_dicts and train_score_dicts are lists of dictionaries and
+        # we make them into dict of lists
+        test_scores = _aggregate_score_dicts(test_score_dicts)
+        if self.return_train_score:
+            train_scores = _aggregate_score_dicts(train_score_dicts)
+
         # We take only one result per split, default by sklearn
-        candidate_params_est = list(parameters_est[::n_splits])
         candidate_params_all = list(parameters_all[::n_splits])
-        n_candidates = len(candidate_params_est)
+        n_candidates = len(candidate_params_all)

         # Computed the (weighted) mean and std for test scores alone
         # NOTE test_sample counts (weights) remain the same for all candidates
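_aggregate_score_dicts, imported from sklearn above, turns the per-fold list of score dictionaries into one dictionary of arrays, keyed per metric. Sketched by hand with illustrative metric names:

```python
# Hand-rolled equivalent of sklearn's _aggregate_score_dicts semantics.
import numpy as np

def aggregate_score_dicts(scores):
    return {key: np.asarray([s[key] for s in scores]) for key in scores[0]}

folds = [{'f1_weighted': 0.80, 'average_precision': 0.75},
         {'f1_weighted': 0.78, 'average_precision': 0.71}]
print(aggregate_score_dicts(folds))
# {'f1_weighted': array([0.8 , 0.78]), 'average_precision': array([0.75, 0.71])}
```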

@@ -826,7 +833,7 @@ Source code for WORC.classification.SearchCV
                    try:
                                     array_means = np.average(array, axis=1, weights=weights)
                                 except ZeroDivisionError as e:
                    -                e = f'[WORC Warning] {e}. Setting {key_name} to unweighted.'
                    +                e = f'[WORC Warning] {e}. Setting {key_name} to unweighted.'
                                     print(e)
                                     array_means = np.average(array, axis=1)
                     
@@ -841,7 +848,7 @@ Source code for WORC.classification.SearchCV
                    array_means[:, np.newaxis]) ** 2,
                                                                     axis=1, weights=weights))
                                 except ZeroDivisionError as e:
                    -                e = f'[WORC Warning] {e}. Setting {key_name} to unweighted.'
                    +                e = f'[WORC Warning] {e}. Setting {key_name} to unweighted.'
                                     print(e)
                                     array_stds = np.sqrt(np.average((array -
                                                                      array_means[:, np.newaxis]) ** 2,
@@ -853,13 +860,37 @@ Source code for WORC.classification.SearchCV
                    results["rank_%s" % key_name] = np.asarray(
                                         rankdata(-array_means, method='min'), dtype=np.int32)
                     
                    -        _store('test_score', test_scores, splits=True, rank=True,
                    -               weights=test_sample_counts if self.iid else None)
                    -        if self.return_train_score:
                    -            _store('train_score', train_scores, splits=True)
                             _store('fit_time', fit_time)
                             _store('score_time', score_time)
                     
                    +        # Store scores
                    +        # Check whether to do multimetric scoring
                    +        test_estimator = cc.construct_classifier(candidate_params_all[0])
                    +        scorers, self.multimetric_ = check_multimetric_scoring(
                    +            test_estimator, scoring=self.scoring)
                    +
                    +        # NOTE test_sample counts (weights) remain the same for all candidates
                    +        test_sample_counts = np.array(test_sample_counts[:n_splits],
                    +                                      dtype=np.int)
                    +
                    +        if self.iid != 'deprecated':
                    +            warnings.warn(
                    +                "The parameter 'iid' is deprecated in 0.22 and will be "
                    +                "removed in 0.24.", FutureWarning
                    +            )
                    +            iid = self.iid
                    +        else:
                    +            iid = False
                    +
                    +        for scorer_name in scorers.keys():
                    +            # Computed the (weighted) mean and std for test scores alone
                    +            _store('test_%s' % scorer_name, test_scores[scorer_name],
                    +                   splits=True, rank=True,
                    +                   weights=test_sample_counts if iid else None)
                    +            if self.return_train_score:
                    +                _store('train_%s' % scorer_name, train_scores[scorer_name],
                    +                       splits=True)
                    +
                             # Compute the "Generalization" score
                             difference_score = abs(results['mean_train_score'] - results['mean_test_score'])
                             generalization_score = results['mean_test_score'] - difference_score
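The generalization score defined above penalizes the mean test score by the train/test gap; a numeric illustration:

```python
# Numeric illustration of the generalization score formula above.
mean_train_score = 0.95
mean_test_score = 0.80
difference_score = abs(mean_train_score - mean_test_score)   # 0.15
generalization_score = mean_test_score - difference_score    # 0.65
print(generalization_score)
```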
@@ -867,6 +898,45 @@ Source code for WORC.classification.SearchCV
                    results['rank_generalization_score'] = np.asarray(
                                 rankdata(-results['generalization_score'], method='min'), dtype=np.int32)
                     
                    +        if self.multimetric_:
                    +            if self.refit is not False and (
                    +                    not isinstance(self.refit, str) or
                    +                    # This will work for both dict / list (tuple)
                    +                    self.refit not in scorers) and not callable(self.refit):
                    +                raise ValueError("For multi-metric scoring, the parameter "
                    +                                 "refit must be set to a scorer key or a "
                    +                                 "callable to refit an estimator with the "
                    +                                 "best parameter setting on the whole "
                    +                                 "data and make the best_* attributes "
                    +                                 "available for that metric. If this is "
                    +                                 "not needed, refit should be set to "
                    +                                 "False explicitly. %r was passed."
                    +                                 % self.refit)
                    +            else:
                    +                refit_metric = self.refit
                    +        else:
                    +            refit_metric = 'score'
                    +
                    +        # For multi-metric evaluation, store the best_index_, best_params_ and
                    +        # best_score_ iff refit is one of the scorer names
                    +        # In single metric evaluation, refit_metric is "score"
                    +        if self.refit or not self.multimetric_:
                    +            # If callable, refit is expected to return the index of the best
                    +            # parameter set.
                    +            if callable(self.refit):
                    +                self.best_index_ = self.refit(results)
                    +                if not isinstance(self.best_index_, numbers.Integral):
                    +                    raise TypeError('best_index_ returned is not an integer')
                    +                if (self.best_index_ < 0 or
                    +                   self.best_index_ >= len(results["params"])):
                    +                    raise IndexError('best_index_ index out of range')
                    +            else:
                    +                self.best_index_ = results["rank_test_%s"
                    +                                           % refit_metric].argmin()
                    +                self.best_score_ = results["mean_test_%s" % refit_metric][
                    +                                           self.best_index_]
                    +            self.best_params_ = candidate_params_all[self.best_index_]
                    +
                             # Rank the indices of scores from all parameter settings
                             ranked_test_scores = results["rank_" + self.ranking_score]
                             indices = range(0, len(ranked_test_scores))
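The refit handling above follows sklearn's contract: refit may be a scorer key or a callable that receives the cv_results_ dictionary and returns an integer index of the chosen candidate. A minimal sketch of such a callable; the selection strategy shown is hypothetical:

```python
# Hypothetical refit callable matching the contract validated above.
import numpy as np

def refit_strategy(results):
    """Must return an integer index into results['params']; here simply
    the candidate with the best mean test score."""
    return int(np.argmax(results['mean_test_score']))

# Hypothetical usage with a SearchCV subclass:
# search = RandomSearchCV(..., refit=refit_strategy)
```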
@@ -877,48 +947,35 @@ Source code for WORC.classification.SearchCV
                    maxlen = min(self.maxlen, n_candidates)
                             bestindices = sortedindices[0:maxlen]
                     
                    -        candidate_params_est = np.asarray(candidate_params_est)[bestindices].tolist()
                             candidate_params_all = np.asarray(candidate_params_all)[bestindices].tolist()
                             for k in results.keys():
                                 results[k] = results[k][bestindices]
                    -        n_candidates = len(candidate_params_est)
                    +        n_candidates = len(candidate_params_all)
                    +        results['params'] = candidate_params_all
                     
        # Store the attributes of the best performing estimator
                             best_index = np.flatnonzero(results["rank_" + self.ranking_score] == 1)[0]
                    -        best_parameters_est = candidate_params_est[best_index]
                             best_parameters_all = candidate_params_all[best_index]
                     
                    -        # Use one MaskedArray and mask all the places where the param is not
                    -        # applicable for that candidate. Use defaultdict as each candidate may
                    -        # not contain all the params
                    -        param_results = defaultdict(partial(MaskedArray,
                    -                                            np.empty(n_candidates,),
                    -                                            mask=True,
                    -                                            dtype=object))
                    -        for cand_i, params in enumerate(candidate_params_all):
                    -            for name, value in params.items():
                    -                # An all masked empty array gets created for the key
                    -                # `"param_%s" % name` at the first occurence of `name`.
                    -                # Setting the value at an index also unmasks that index
                    -                param_results["param_%s" % name][cand_i] = value
                    -
                    -        # Store a list of param dicts at the key 'params'
                    -        results['params'] = candidate_params_est
                    -        results['params_all'] = candidate_params_all
                    -
                    +        # Store several objects
                             self.cv_results_ = results
                    -        self.best_index_ = best_index
                             self.n_splits_ = n_splits
                             self.cv_iter = cv_iter
                    +        self.best_index_ = best_index
                    +        self.best_params_ = results["params"][self.best_index_]
                    +
                    +        if self.refit:
                    +            # We always refit on the full dataset
                    +            indices = np.arange(0, len(y))
                    +            self.refit_and_score(X, y, best_parameters_all,
                    +                                 train=indices, test=indices)
                     
                    -        # Refit all objects with best settings on the full dataset
                    -        indices = range(0, len(y))
                    -        self.refit_and_score(X, y, best_parameters_all, best_parameters_est,
                    -                             train=indices, test=indices)
                    +        # Store the only scorer not as a dict for single metric evaluation
                    +        self.scorer_ = scorers if self.multimetric_ else scorers['score']
                     
                             return self
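When refit is enabled, the rewritten code refits once on the complete dataset by passing the same index vector as both the train and the test split. A toy sketch of that convention, with a generic scikit-learn estimator standing in for WORC's workflow:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    X = np.random.rand(20, 3)
    y = np.random.randint(0, 2, 20)

    # "Refit on the full dataset" means train == test == all sample indices:
    indices = np.arange(0, len(y))
    train, test = indices, indices

    est = LogisticRegression().fit(X[train], y[train])
    print(est.score(X[test], y[test]))  # evaluated on the same, complete set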
-    def refit_and_score(self, X, y, parameters_all, parameters_est,
-                        train, test, verbose=None):
+    def refit_and_score(self, X, y, parameters_all,
+                        train, test, verbose=None):
         """Refit the base estimator and attributes such as GroupSel
@@ -936,9 +993,6 @@
                                    and the fitting. TODO: Create a default object and show the
                                     fields.
                     
                    -        parameters_est: dictionary, mandatory
                    -                Contains the settings used for the base estimator
                    -
                             train: list, mandatory
                                     Indices of the objects to be used as training set.
                     
@@ -970,19 +1024,21 @@
                    preprocessor = None
                     
                             # Refit all preprocessing functions
                    +        fit_params = _check_fit_params(X, self.fit_params)
                             out = fit_and_score(X_fit, y, self.scoring,
                                                 train, test, parameters_all,
                    -                            fit_params=self.fit_params,
                    +                            fit_params=fit_params,
                                                 return_train_score=self.return_train_score,
                                                 return_n_test_samples=True,
                    -                            return_times=True, return_parameters=True,
                    +                            return_times=True, return_parameters=False,
                    +                            return_estimator=False,
                                                 error_score=self.error_score,
                                                 verbose=verbose,
                                                 return_all=True)
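_check_fit_params is a private scikit-learn helper (present in the 0.23.x releases this version targets) that validates per-sample fit parameters against X and, when given indices, slices them to the matching subset. A small sketch of what it does:

    import numpy as np
    from sklearn.utils.validation import _check_fit_params  # private helper, sklearn 0.23.x

    X = np.zeros((4, 2))
    fit_params = {'sample_weight': [1.0, 2.0, 3.0, 4.0]}

    checked = _check_fit_params(X, fit_params)                           # validated, unsliced
    subset = _check_fit_params(X, fit_params, indices=np.array([0, 2]))  # sliced to samples 0 and 2
    print(subset['sample_weight'])  # [1. 3.]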
                     
                             # Associate best options with new fits
                             (save_data, GroupSel, VarSel, SelectModel, feature_labels, scalers,\
                    -            Imputers, PCAs, StatisticalSel, ReliefSel, sm, ros) = out
                    +            Imputers, PCAs, StatisticalSel, ReliefSel, Sampler) = out
                             self.best_groupsel = GroupSel
                             self.best_scaler = scalers
                             self.best_varsel = VarSel
@@ -993,14 +1049,12 @@
                    self.best_featlab = feature_labels
                             self.best_statisticalsel = StatisticalSel
                             self.best_reliefsel = ReliefSel
                    -        self.best_SMOTE = sm
                    -        self.best_RandomOverSampler = ros
                    +        self.best_Sampler = Sampler
                     
                             # Fit the estimator using the preprocessed features
                             X = [x[0] for x in X]
                             X, y = self.preprocess(X, y, training=True)
                     
                    -        parameters_est = delete_nonestimator_parameters(parameters_est)
                             best_estimator = cc.construct_classifier(parameters_all)
                     
                             # NOTE: This just has to go to the construct classifier function,
@@ -1078,9 +1132,8 @@
                    scoring = self.scoring
                     
                             # Get settings for best 100 estimators
                    -        parameters_est = self.cv_results_['params']
                    -        parameters_all = self.cv_results_['params_all']
                    -        n_classifiers = len(parameters_est)
                    +        parameters_all = self.cv_results_['params']
                    +        n_classifiers = len(parameters_all)
                             n_iter = len(self.cv_iter)
                     
                             # Create a new base object for the ensemble components
@@ -1092,7 +1145,7 @@
                    if type(method) is int:
                                 # Simply take the top50 best hyperparameters
                                 if verbose:
                    -                print(f'Creating ensemble using top {str(method)} individual classifiers.')
                    +                print(f'Creating ensemble using top {str(method)} individual classifiers.')
                                 if method == 1:
                                     # Next functions expect list
                                     ensemble = [0]
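For an integer method, the ensemble is simply the indices of the N best-ranked workflows. A minimal sketch of that branch, with illustrative names:

    import numpy as np

    def top_n_ensemble(ranked_test_scores, method):
        """Return the indices of the `method` best-ranked candidates (rank 1 = best)."""
        if method == 1:
            return [0]  # the downstream code expects a list
        sortedindices = np.argsort(ranked_test_scores)
        return sortedindices[:method].tolist()

    print(top_n_ensemble(np.array([3, 1, 4, 2]), 2))  # [1, 3]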
@@ -1113,20 +1166,20 @@
                    performances = np.zeros((n_iter, n_classifiers))
                                 for it, (train, valid) in enumerate(self.cv_iter):
                                     if verbose:
                    -                    print(f' - iteration {it + 1} / {n_iter}.')
                    +                    print(f' - iteration {it + 1} / {n_iter}.')
                                     Y_valid_score_it = np.zeros((n_classifiers, len(valid)))
                     
                                     # Loop over the 100 best estimators
                    -                for num, (p_est, p_all) in enumerate(zip(parameters_est, parameters_all)):
                    +                for num, p_all in enumerate(parameters_all):
                    # NOTE: Explicitly exclude validation set, else refit and score
                                         # somehow still seems to use it.
                                         X_train_temp = [X_train[i] for i in train]
                                         Y_train_temp = [Y_train[i] for i in train]
                    -                    train_temp = range(0, len(train))
                    +                    train_temp = np.arange(0, len(train))
                     
                                         # Refit a SearchCV object with the provided parameters
                                         base_estimator.refit_and_score(X_train_temp, Y_train_temp, p_all,
                    -                                                   p_est, train_temp, train_temp,
                    +                                                   train_temp, train_temp,
                                                                        verbose=False)
                     
                                         # Predict and save scores
@@ -1221,9 +1274,9 @@
                    best_performance = new_performance
                     
                                     # Print the performance gain
                    -                print(f"Ensembling best {scoring}: {best_performance}.")
                    -                print(f"Single estimator best {scoring}: {single_estimator_performance}.")
                    -                print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
                    +                print(f"Ensembling best {scoring}: {best_performance}.")
                    +                print(f"Single estimator best {scoring}: {single_estimator_performance}.")
                    +                print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
                     
                             elif method == 'Caruana':
                                 # Use the method from Caruana
@@ -1231,11 +1284,9 @@
                    print('Creating ensemble with Caruana method.')
                     
                                 # BUG: kernel parameter is sometimes saved in unicode
                    -            for i in range(0, len(parameters_est)):
                    -                kernel = str(parameters_est[i][u'kernel'])
                    -                del parameters_est[i][u'kernel']
                    +            for i in range(0, len(parameters_all)):
                    +                kernel = str(parameters_all[i][u'kernel'])
                                     del parameters_all[i][u'kernel']
                    -                parameters_est[i]['kernel'] = kernel
                                     parameters_all[i]['kernel'] = kernel
                     
                                 # In order to speed up the process, we precompute all scores of the possible
@@ -1249,20 +1300,20 @@
                    performances = np.zeros((n_iter, n_classifiers))
                                 for it, (train, valid) in enumerate(self.cv_iter):
                                     if verbose:
                    -                    print(f' - iteration {it + 1} / {n_iter}.')
                    +                    print(f' - iteration {it + 1} / {n_iter}.')
                                     Y_valid_score_it = np.zeros((n_classifiers, len(valid)))
                     
                                     # Loop over the 100 best estimators
                    -                for num, (p_est, p_all) in enumerate(zip(parameters_est, parameters_all)):
                    +                for num, p_all in enumerate(parameters_all):
                    # NOTE: Explicitly exclude validation set, else refit and score
                                         # somehow still seems to use it.
                                         X_train_temp = [X_train[i] for i in train]
                                         Y_train_temp = [Y_train[i] for i in train]
                    -                    train_temp = range(0, len(train))
                    +                    train_temp = np.arange(0, len(train))
                     
                                         # Refit a SearchCV object with the provided parameters
                                         base_estimator.refit_and_score(X_train_temp, Y_train_temp, p_all,
                    -                                                   p_est, train_temp, train_temp,
                    +                                                   train_temp, train_temp,
                                                                        verbose=False)
                     
                                         # Predict and save scores
@@ -1357,9 +1408,9 @@
                    best_performance = new_performance
                     
                                     # Print the performance gain
                    -                print(f"Ensembling best {scoring}: {best_performance}.")
                    -                print(f"Single estimator best {scoring}: {single_estimator_performance}.")
                    -                print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
                    +                print(f"Ensembling best {scoring}: {best_performance}.")
                    +                print(f"Single estimator best {scoring}: {single_estimator_performance}.")
                    +                print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
                     
                                 # Greedy selection  -----------------------------------------------
                                 # Initialize variables
@@ -1373,7 +1424,7 @@
                    while new_performance > best_performance:
                                     # Score is better, so expand ensemble and replace new best score
                                     if verbose:
                    -                    print(f"Iteration: {iteration}, best {scoring}: {new_performance}.")
                    +                    print(f"Iteration: {iteration}, best {scoring}: {new_performance}.")
                                     best_performance = new_performance
                     
                                     if iteration > 1:
@@ -1419,23 +1470,22 @@
                    iteration += 1
                     
                                 # Print the performance gain
                    -            print(f"Ensembling best {scoring}: {best_performance}.")
                    -            print(f"Single estimator best {scoring}: {single_estimator_performance}.")
                    -            print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
                    +            print(f"Ensembling best {scoring}: {best_performance}.")
                    +            print(f"Single estimator best {scoring}: {single_estimator_performance}.")
                    +            print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
                             else:
                    -            print(f'[WORC WARNING] No valid ensemble method given: {method}. Not ensembling')
                    +            print(f'[WORC WARNING] No valid ensemble method given: {method}. Not ensembling')
                                 return self
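Both the integer and Caruana branches revolve around the same greedy forward selection visible in the loops above: repeatedly add the estimator whose inclusion most improves the averaged validation score, and stop when no candidate improves it. A compact sketch of that selection; the names and the metric are illustrative, not WORC's exact implementation:

    import numpy as np

    def greedy_ensemble(Y_valid_score, y_valid, metric):
        """Caruana-style forward selection over precomputed validation scores.

        Y_valid_score[i] holds estimator i's scores on the validation samples;
        selection is with replacement, so an estimator may be picked repeatedly.
        """
        n_classifiers = Y_valid_score.shape[0]
        ensemble, best_performance = [], -np.inf
        while True:
            candidates = [metric(y_valid, np.mean(Y_valid_score[ensemble + [i]], axis=0))
                          for i in range(n_classifiers)]
            if max(candidates) <= best_performance:
                return ensemble
            best_performance = max(candidates)
            ensemble.append(int(np.argmax(candidates)))

    # e.g. with sklearn.metrics.roc_auc_score as the metric:
    # ensemble = greedy_ensemble(scores, y_valid, roc_auc_score)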
                     
                             # Create the ensemble --------------------------------------------------
                             # Create the ensemble trained on the full training set
                    -        parameters_est = [parameters_est[i] for i in ensemble]
                             parameters_all = [parameters_all[i] for i in ensemble]
                             estimators = list()
                    -        train = range(0, len(X_train))
                    +        train = np.arange(0, len(X_train))
                             nest = len(ensemble)
                    -        for enum, (p_est, p_all) in enumerate(zip(parameters_est, parameters_all)):
                    +        for enum, p_all in enumerate(parameters_all):
                                 # Refit a SearchCV object with the provided parameters
                    -            print(f"Refitting estimator {enum+1} / {nest}.")
                    +            print(f"Refitting estimator {enum+1} / {nest}.")
                                 base_estimator = clone(base_estimator)
                     
                                 # # Check if we need to create a multiclass estimator
@@ -1444,7 +1494,7 @@
                    #     base_estimator = OneVsRestClassifier(base_estimator)
                     
                                 base_estimator.refit_and_score(X_train, Y_train, p_all,
                    -                                           p_est, train, train,
                    +                                           train, train,
                                                                verbose=False)
                     
                                 # Determine whether to overfit the feature scaling on the test set
@@ -1467,16 +1517,37 @@
                    isclassifier =\
                                 not any(clf in regressors for clf in self.param_distributions['classifiers'])
                     
                    +        # Check the cross-validation object and do the splitting
                             cv = check_cv(self.cv, y, classifier=isclassifier)
                     
                             X, y, groups = indexable(X, y, groups)
                             n_splits = cv.get_n_splits(X, y, groups)
                             if self.verbose > 0 and isinstance(parameter_iterable, Sized):
                                 n_candidates = len(parameter_iterable)
                    -            print(f"Fitting {n_splits} folds for each of {n_candidates} candidates, totalling {n_candidates * n_splits} fits.")
                    +            print(f"Fitting {n_splits} folds for each of {n_candidates} candidates, totalling {n_candidates * n_splits} fits.")
                     
                             cv_iter = list(cv.split(X, y, groups))
                    -        name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
                    +
                    +        # NOTE: We do not check the scoring here, as this can differ
                    +        # per estimator. Thus, this is done inside the fit and scoring
                    +
                    +        # Check fitting parameters
                    +        fit_params = _check_fit_params(X, self.fit_params)
                    +
                    +        # Create temporary directory for fastr
                    +        if DebugDetector().do_detection():
                    +            # Specific name for easy debugging
                    +            debugnum = 0
                    +            name = 'DEBUG_' + str(debugnum)
                    +            tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)
                    +            while os.path.exists(tempfolder):
                    +                debugnum += 1
                    +                name = 'DEBUG_' + str(debugnum)
                    +                tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)
                    +
                    +        else:
                    +            name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
                    +
                             tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)
                             if not os.path.exists(tempfolder):
                                 os.makedirs(tempfolder)
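In debug mode the temporary grid-search folder gets a predictable DEBUG_<n> name (and, per a later hunk, DEBUG_0 is kept after the run for inspection). A standalone sketch of the naming loop, using the system temp dir in place of the fastr 'tmp' mount:

    import os
    import tempfile

    tmp_mount = tempfile.gettempdir()  # stand-in for fastr.config.mounts['tmp']

    debugnum = 0
    name = f'DEBUG_{debugnum}'
    tempfolder = os.path.join(tmp_mount, 'GS', name)
    while os.path.exists(tempfolder):
        debugnum += 1
        name = f'DEBUG_{debugnum}'
        tempfolder = os.path.join(tmp_mount, 'GS', name)

    os.makedirs(tempfolder)  # the first free DEBUG_<n> folder wins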
@@ -1520,7 +1591,7 @@
                    message = 'One or more of the values in your parameter sampler ' +\
                                           'is either not iterable, or the distribution cannot ' +\
                      'generate valid samples. Please check your' +\
                    -                      f' parameters. At least {k} gives an error.'
                    +                      f' parameters. At least {k} gives an error.'
                                 raise WORCexceptions.WORCValueError(message)
                     
        # Split the parameter files into equal parts
@@ -1532,7 +1603,7 @@
                    for number in k:
                                     temp_dict[number] = parameters_temp[number]
                     
                    -            fname = f'settings_{num}.json'
                    +            fname = f'settings_{num}.json'
                                 sourcename = os.path.join(tempfolder, 'parameters', fname)
                                 if not os.path.exists(os.path.dirname(sourcename)):
                                     os.makedirs(os.path.dirname(sourcename))
@@ -1540,7 +1611,7 @@
                    json.dump(temp_dict, fp, indent=4)
                     
                                 parameter_files[str(num)] =\
                    -                f'vfs://tmp/GS/{name}/parameters/{fname}'
                    +                f'vfs://tmp/GS/{name}/parameters/{fname}'
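The loop above shards the sampled parameter settings over numbered JSON files, each registered under a vfs:// URL that fastr later resolves. A self-contained sketch of the same chunk-and-dump pattern; the function name and arguments are illustrative:

    import json
    import os

    def write_parameter_chunks(parameters_temp, chunks, tempfolder, name):
        """Write each chunk of sampled settings to settings_<num>.json."""
        parameter_files = {}
        for num, keys in enumerate(chunks):
            temp_dict = {k: parameters_temp[k] for k in keys}

            fname = f'settings_{num}.json'
            sourcename = os.path.join(tempfolder, 'parameters', fname)
            os.makedirs(os.path.dirname(sourcename), exist_ok=True)
            with open(sourcename, 'w') as fp:
                json.dump(temp_dict, fp, indent=4)

            parameter_files[str(num)] = f'vfs://tmp/GS/{name}/parameters/{fname}'
        return parameter_files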
                     
                             # Create test-train splits
                             traintest_files = dict()
@@ -1553,13 +1624,13 @@
                    index=source_labels,
                                                         name='Train-test data')
                     
                    -            fname = f'traintest_{num}.hdf5'
                    +            fname = f'traintest_{num}.hdf5'
                                 sourcename = os.path.join(tempfolder, 'traintest', fname)
                                 if not os.path.exists(os.path.dirname(sourcename)):
                                     os.makedirs(os.path.dirname(sourcename))
                    -            traintest_files[str(num)] = f'vfs://tmp/GS/{name}/traintest/{fname}'
                    +            traintest_files[str(num)] = f'vfs://tmp/GS/{name}/traintest/{fname}'
                     
                    -            sourcelabel = f"Source Data Iteration {num}"
                    +            sourcelabel = f"Source Data Iteration {num}"
                                 source_data.to_hdf(sourcename, sourcelabel)
                     
                                 num += 1
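Each train-test split is serialized as a small pandas Series in an HDF5 file. A sketch of one iteration's source file; the index labels are assumed (the real source_labels are not shown in this hunk), and pandas' HDF5 support needs the optional tables package:

    import os
    import pandas as pd

    train, test = [0, 1, 2], [3, 4]          # illustrative split
    num, name = 0, 'EXAMPLE'
    tempfolder = os.path.join('/tmp', 'GS', name)

    source_data = pd.Series([train, test],
                            index=['train', 'test'],   # assumed source_labels
                            name='Train-test data')

    fname = f'traintest_{num}.hdf5'
    sourcename = os.path.join(tempfolder, 'traintest', fname)
    os.makedirs(os.path.dirname(sourcename), exist_ok=True)
    source_data.to_hdf(sourcename, f'Source Data Iteration {num}')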
@@ -1569,12 +1640,20 @@
                    'verbose', 'fit_params', 'return_train_score',
                                                 'return_n_test_samples',
                                                 'return_times', 'return_parameters',
                    +                            'return_estimator',
                                                 'error_score']
                     
                    +        verbose = False
                    +        return_n_test_samples = True
                    +        return_times = True
                    +        return_parameters = False
                    +        return_estimator = False
                             estimator_data = pd.Series([X, y, self.scoring,
                    -                                    False,
                    -                                    self.fit_params, self.return_train_score,
                    -                                    True, True, True,
                    +                                    verbose, fit_params,
                    +                                    self.return_train_score,
                    +                                    return_n_test_samples, return_times,
                    +                                    return_parameters,
                    +                                    return_estimator,
                                                         self.error_score],
                                                        index=estimator_labels,
                                                        name='estimator Data')
@@ -1582,7 +1661,7 @@
                    estimatorname = os.path.join(tempfolder, fname)
                             estimator_data.to_hdf(estimatorname, 'Estimator Data')
                     
                    -        estimatordata = f"vfs://tmp/GS/{name}/{fname}"
                    +        estimatordata = f"vfs://tmp/GS/{name}/{fname}"
                     
                             # Create the fastr network
                             network = fastr.create_network('WORC_GridSearch_' + name)
@@ -1604,7 +1683,7 @@
                    source_data = {'estimator_source': estimatordata,
                                            'traintest': traintest_files,
                                            'parameters': parameter_files}
                    -        sink_data = {'output': f"vfs://tmp/GS/{name}/output_{{sample_id}}_{{cardinality}}{{ext}}"}
                    +        sink_data = {'output': f"vfs://tmp/GS/{name}/output_{{sample_id}}_{{cardinality}}{{ext}}"}
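The source_data/sink_data mappings feed the network.execute call just below: sources point at the shards written earlier, and the sink URL template tells fastr where to place each sample's output. A sketch of their shape; the estimator filename here is an assumption:

    # Illustrative plumbing only; all URLs are vfs:// paths under fastr's 'tmp' mount.
    name = 'EXAMPLE'
    source_data = {
        'estimator_source': f'vfs://tmp/GS/{name}/estimatordata.hdf5',   # assumed fname
        'traintest': {'0': f'vfs://tmp/GS/{name}/traintest/traintest_0.hdf5'},
        'parameters': {'0': f'vfs://tmp/GS/{name}/parameters/settings_0.json'},
    }
    sink_data = {'output': f'vfs://tmp/GS/{name}/output_{{sample_id}}_{{cardinality}}{{ext}}'}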
                     
                             network.execute(source_data, sink_data,
                                             tmpdir=os.path.join(tempfolder, 'tmp'),
@@ -1617,12 +1696,12 @@
                    difference = expected_no_files - len(sink_files)
                                 fname = os.path.join(tempfolder, 'tmp')
                                 message = ('Fitting classifiers has failed for ' +
                    -                       f'{difference} / {expected_no_files} files. The temporary ' +
                    -                       f'results where not deleted and can be found in {tempfolder}. ' +
                    +                       f'{difference} / {expected_no_files} files. The temporary ' +
+                       f'results were not deleted and can be found in {tempfolder}. ' +
                                            'Probably your fitting and scoring failed: check out ' +
                                            'the tmp/fitandscore folder within the tempfolder for ' +
                                            'the fastr job temporary results or run: fastr trace ' +
                    -                       f'"{fname}{os.path.sep}__sink_data__.json" --samples.')
                    +                       f'"{fname}{os.path.sep}__sink_data__.json" --samples.')
                                 raise WORCexceptions.WORCValueError(message)
                     
                             # Read in the output data once finished
@@ -1634,23 +1713,24 @@
        # if one chooses to see the train score, "out" will contain train score info
                             if self.return_train_score:
                                 (train_scores, test_scores, test_sample_counts,
                    -             fit_time, score_time, parameters_est, parameters_all) =\
                    +             fit_time, score_time, parameters_all) =\
                                   zip(*save_data)
                             else:
                                 (test_scores, test_sample_counts,
                    -             fit_time, score_time, parameters_est, parameters_all) =\
                    +             fit_time, score_time, parameters_all) =\
                                   zip(*save_data)
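The zip(*save_data) idiom transposes the list of per-fit result tuples into one tuple per field, which is what process_fit expects. A toy illustration:

    save_data = [(0.80, 10, 1.2, 0.1, {'C': 1}),    # (test_score, n_test, fit_t, score_t, params)
                 (0.75, 10, 1.1, 0.1, {'C': 10})]

    (test_scores, test_sample_counts, fit_time, score_time, parameters_all) = \
        zip(*save_data)
    print(test_scores)  # (0.8, 0.75)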
                     
                             # Remove the temporary folder used
                    -        shutil.rmtree(tempfolder)
                    +        if name != 'DEBUG_0':
+            # Only delete when not debugging the first iteration
                    +            shutil.rmtree(tempfolder)
                     
                             # Process the results of the fitting procedure
                             self.process_fit(n_splits=n_splits,
                    -                         parameters_est=parameters_est,
                                              parameters_all=parameters_all,
                                              test_sample_counts=test_sample_counts,
                    -                         test_scores=test_scores,
                    -                         train_scores=train_scores,
                    +                         test_score_dicts=test_scores,
                    +                         train_score_dicts=train_scores,
                                              fit_time=fit_time,
                                              score_time=score_time,
                                              cv_iter=cv_iter,
@@ -1866,7 +1946,7 @@
                        """
                     
-    def __init__(self, param_distributions={}, n_iter=10, scoring=None,
+    def __init__(self, param_distributions={}, n_iter=10, scoring=None,
                  fit_params=None, n_jobs=1, iid=True, refit=True,
                  cv=None, verbose=0, pre_dispatch='2*n_jobs',
                  random_state=None, error_score='raise',
                  return_train_score=True,
@@ -1914,19 +1994,23 @@
                    isclassifier =\
                                 not any(clf in regressors for clf in self.param_distributions['classifiers'])
                     
                    +        # Check the cross-validation object and do the splitting
                             cv = check_cv(self.cv, y, classifier=isclassifier)
                     
                             X, y, groups = indexable(X, y, groups)
                             n_splits = cv.get_n_splits(X, y, groups)
                             if self.verbose > 0 and isinstance(parameter_iterable, Sized):
                                 n_candidates = len(parameter_iterable)
                    -            print(f"Fitting {n_splits} folds for each of {n_candidates}" +\
+            print(f"Fitting {n_splits} folds for each of {n_candidates}" +\
                  " candidates, totalling" +\
                  f" {n_candidates * n_splits} fits")
                     
                             pre_dispatch = self.pre_dispatch
                             cv_iter = list(cv.split(X, y, groups))
                     
                    +        # Check fitting parameters
                    +        fit_params = _check_fit_params(X, self.fit_params)
                    +
                             # Draw parameter sample
                             for num, parameters in enumerate(parameter_iterable):
                                 parameter_sample = parameters
@@ -1949,10 +2033,11 @@
                    pre_dispatch=pre_dispatch
                             )(delayed(fit_and_score)(X, y, self.scoring,
                                                      train, test, parameters,
                    -                                 fit_params=self.fit_params,
                    +                                 fit_params=fit_params,
                                                      return_train_score=self.return_train_score,
                                                      return_n_test_samples=True,
                    -                                 return_times=True, return_parameters=True,
                    +                                 return_times=True, return_parameters=False,
                    +                                 return_estimator=False,
                                                      error_score=self.error_score,
                                                      verbose=False,
                                                      return_all=False)
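This variant runs fit_and_score in-process through joblib rather than through a fastr network; the Parallel(...)(delayed(f)(...) for ...) pattern fans one call out per (parameter sample, CV split) pair. A toy illustration with a stub standing in for fit_and_score:

    from joblib import Parallel, delayed

    def fit_and_score_stub(params, split):
        return params, split, 0.5  # hypothetical score

    out = Parallel(n_jobs=1, pre_dispatch='2*n_jobs')(
        delayed(fit_and_score_stub)(params, split)
        for params in [{'C': 1}, {'C': 10}]
        for split in range(3)
    )
    print(len(out))  # 6 = 2 parameter samples x 3 splits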
@@ -1963,19 +2048,18 @@
        # if one chooses to see the train score, "out" will contain train score info
                             if self.return_train_score:
                                 (train_scores, test_scores, test_sample_counts,
                    -             fit_time, score_time, parameters_est, parameters_all) =\
                    +             fit_time, score_time, parameters_all) =\
                                   save_data
                             else:
                                 (test_scores, test_sample_counts,
                    -             fit_time, score_time, parameters_est, parameters_all) =\
                    +             fit_time, score_time, parameters_all) =\
                                   save_data
                     
                             self.process_fit(n_splits=n_splits,
                    -                         parameters_est=parameters_est,
                                              parameters_all=parameters_all,
                                              test_sample_counts=test_sample_counts,
                    -                         test_scores=test_scores,
                    -                         train_scores=train_scores,
                    +                         test_score_dicts=test_scores,
                    +                         train_score_dicts=train_scores,
                                              fit_time=fit_time,
                                              score_time=score_time,
                                              cv_iter=cv_iter,
@@ -2214,7 +2298,7 @@
                        """
                     
-    def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
+    def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
                  n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
                  pre_dispatch='2*n_jobs', error_score='raise',
                  return_train_score=True):
@@ -2456,7 +2540,7 @@
                        """
                     
-    def __init__(self, param_distributions={}, n_iter=10, scoring=None,
+    def __init__(self, param_distributions={}, n_iter=10, scoring=None,
                  fit_params=None, n_jobs=1, iid=True, refit=True,
                  cv=None, verbose=0, pre_dispatch='2*n_jobs',
                  random_state=None, error_score='raise',
                  return_train_score=True,
@@ -2723,7 +2807,7 @@
                        """
                     
-    def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
+    def __init__(self, estimator, param_grid, scoring=None, fit_params=None,
                  n_jobs=1, iid=True, refit=True, cv=None, verbose=0,
                  pre_dispatch='2*n_jobs', error_score='raise',
                  return_train_score=True):
@@ -2766,11 +2850,19 @@
                         

+ © Copyright 2016 -- 2020, Biomedical Imaging Group Rotterdam, Departments of Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands

- Built with Sphinx using a theme provided by Read the Docs.
+ Built with Sphinx using a theme provided by Read the Docs.

[The remaining hunks touch only the Read the Docs page chrome of the generated HTML: stylesheet/script asset tags and navigation markup, plus the sidebar version label, which changes from 3.2.2 to 3.3.0.]