rebase and fix flake

ravinkohli · ravinkohli · commit 4b063f9dcc7f · 2021-12-21T18:01:00.000+01:00
diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py
@@ -27,7 +27,7 @@
 
 import pandas as pd
 
-from smac.runhistory.runhistory import DataOrigin, RunHistory
+from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue
 from smac.stats.stats import Stats
 from smac.tae import StatusType
 
@@ -291,7 +291,10 @@ def _get_dataset_input_validator(
         y_train: Union[List, pd.DataFrame, np.ndarray],
         X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-        resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
+        resampling_strategy: Optional[Union[
+            CrossValTypes,
+            HoldoutValTypes,
+            NoResamplingStrategyTypes]] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
     ) -> Tuple[BaseDataset, BaseInputValidator]:
@@ -335,7 +338,10 @@ def get_dataset(
         y_train: Union[List, pd.DataFrame, np.ndarray],
         X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-        resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
+        resampling_strategy: Optional[Union[
+            CrossValTypes,
+            HoldoutValTypes,
+            NoResamplingStrategyTypes]] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
     ) -> BaseDataset:
@@ -593,18 +599,6 @@ def _load_models(self) -> bool:
             raise ValueError("Resampling strategy is needed to determine what models to load")
         self.ensemble_ = self._backend.load_ensemble(self.seed)
 
-        # TODO: remove this code after `fit_pipeline` is rebased.
-        if hasattr(self, '_disable_file_output'):
-            if isinstance(self._disable_file_output, List):
-                disabled_file_outputs = self._disable_file_output
-                disable_file_output = False
-            elif isinstance(self._disable_file_output, bool):
-                disable_file_output = self._disable_file_output
-                disabled_file_outputs = []
-        else:
-            disable_file_output = False
-            disabled_file_outputs = []
-
         # If no ensemble is loaded, try to get the best performing model
         if not self.ensemble_:
             self.ensemble_ = self._load_best_individual_model()
@@ -619,7 +613,7 @@ def _load_models(self) -> bool:
                 if len(self.cv_models_) == 0:
                     raise ValueError('No models fitted!')
 
-        elif disable_file_output or 'pipeline' not in disabled_file_outputs:
+        elif 'pipeline' not in self._disable_file_output:
             model_names = self._backend.list_all_models(self.seed)
 
             if len(model_names) == 0:
@@ -1395,7 +1389,10 @@ def fit_pipeline(
         X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         dataset_name: Optional[str] = None,
-        resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes]] = None,
+        resampling_strategy: Optional[Union[
+            CrossValTypes,
+            HoldoutValTypes,
+            NoResamplingStrategyTypes]] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         run_time_limit_secs: int = 60,
         memory_limit: Optional[int] = None,
@@ -1511,7 +1508,6 @@ def fit_pipeline(
             (BaseDataset):
                 Dataset created from the given tensors
         """
-        self.dataset_name = dataset.dataset_name
 
         if dataset is None:
             if (
diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py
@@ -156,7 +156,10 @@ def _get_dataset_input_validator(
         y_train: Union[List, pd.DataFrame, np.ndarray],
         X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-        resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
+        resampling_strategy: Optional[Union[
+            CrossValTypes,
+            HoldoutValTypes,
+            NoResamplingStrategyTypes]] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
     ) -> Tuple[TabularDataset, TabularInputValidator]:
@@ -371,19 +374,6 @@ def search(
             self
 
         """
-        if dataset_name is None:
-            dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
-
-        # we have to create a logger for at this point for the validator
-        self._logger = self._get_logger(dataset_name)
-
-        # Create a validator object to make sure that the data provided by
-        # the user matches the autopytorch requirements
-        self.InputValidator = TabularInputValidator(
-            is_classification=True,
-            logger_port=self._logger_port,
-        )
-
         self.dataset, self.InputValidator = self._get_dataset_input_validator(
             X_train=X_train,
             y_train=y_train,
@@ -401,9 +391,6 @@ def search(
                 '(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
             )
 
-        if self.dataset is None:
-            raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
-
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py
@@ -156,7 +156,10 @@ def _get_dataset_input_validator(
         y_train: Union[List, pd.DataFrame, np.ndarray],
         X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
         y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
-        resampling_strategy: Optional[Union[CrossValTypes, HoldoutValTypes]] = None,
+        resampling_strategy: Optional[Union[
+            CrossValTypes,
+            HoldoutValTypes,
+            NoResamplingStrategyTypes]] = None,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         dataset_name: Optional[str] = None,
     ) -> Tuple[TabularDataset, TabularInputValidator]:
@@ -386,9 +389,6 @@ def search(
                 '(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
             )
 
-        if self.dataset is None:
-            raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
-
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
diff --git a/autoPyTorch/evaluation/fit_evaluator.py b/autoPyTorch/evaluation/fit_evaluator.py
@@ -16,6 +16,7 @@
     AbstractEvaluator,
     fit_and_suppress_warnings
 )
+from autoPyTorch.evaluation.utils import DisableFileOutputParameters
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
 from autoPyTorch.utils.common import subsampler
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
@@ -33,7 +34,7 @@ def __init__(self, backend: Backend, queue: Queue,
                  num_run: Optional[int] = None,
                  include: Optional[Dict[str, Any]] = None,
                  exclude: Optional[Dict[str, Any]] = None,
-                 disable_file_output: Union[bool, List] = False,
+                 disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
                  init_params: Optional[Dict[str, Any]] = None,
                  logger_port: Optional[int] = None,
                  keep_models: Optional[bool] = None,
@@ -241,14 +242,11 @@ def file_output(
                 )
 
         # Abort if we don't want to output anything.
-        if hasattr(self, 'disable_file_output'):
-            if self.disable_file_output:
-                return None, {}
-            else:
-                self.disabled_file_outputs = []
+        if 'all' in self.disable_file_output:
+            return None, {}
 
-        if hasattr(self, 'pipeline') and self.pipeline is not None:
-            if 'pipeline' not in self.disabled_file_outputs:
+        if getattr(self, 'pipeline', None) is not None:
+            if 'pipeline' not in self.disable_file_output:
                 pipeline = self.pipeline
             else:
                 pipeline = None
@@ -265,11 +263,11 @@ def file_output(
             ensemble_predictions=None,
             valid_predictions=(
                 Y_valid_pred if 'y_valid' not in
-                                self.disabled_file_outputs else None
+                                self.disable_file_output else None
             ),
             test_predictions=(
                 Y_test_pred if 'y_test' not in
-                               self.disabled_file_outputs else None
+                               self.disable_file_output else None
             ),
         )
 
@@ -287,8 +285,8 @@ def eval_function(
     num_run: int,
     include: Optional[Dict[str, Any]],
     exclude: Optional[Dict[str, Any]],
-    disable_file_output: Union[bool, List],
     output_y_hat_optimization: bool = False,
+    disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
     pipeline_config: Optional[Dict[str, Any]] = None,
     budget_type: str = None,
     init_params: Optional[Dict[str, Any]] = None,
@@ -297,14 +295,75 @@ def eval_function(
     search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
     instance: str = None,
 ) -> None:
+    """
+    This closure allows the communication between the ExecuteTaFuncWithQueue and the
+    pipeline trainer (FitEvaluator).
+
+    Fundamentally, smac calls the ExecuteTaFuncWithQueue.run() method, which internally
+    builds a FitEvaluator. The FitEvaluator builds a pipeline, stores the output files
+    to disk via the backend, and puts the performance result of the run in the queue.
+
+
+    Attributes:
+        backend (Backend):
+            An object to interface with the disk storage. In particular, allows to
+            access the train and test datasets
+        queue (Queue):
+            Each worker available will instantiate an evaluator, and after completion,
+            it will return the evaluation result via a multiprocessing queue
+        metric (autoPyTorchMetric):
+            A scorer object that is able to evaluate how good a pipeline was fit. It
+            is a wrapper on top of the actual score method (a wrapper on top of
+            scikit-learn accuracy, for example) that formats the predictions accordingly.
+        budget (float):
+            The number of epochs or the amount of time a configuration is allowed to run.
+        budget_type (str):
+            The budget type, which can be epochs or time.
+        pipeline_config (Optional[Dict[str, Any]]):
+            Defines the content of the pipeline being evaluated. For example, it
+            contains pipeline specific settings like logging name, or whether or not
+            to use tensorboard.
+        config (Union[int, str, Configuration]):
+            Determines the pipeline to be constructed.
+        seed (int):
+            An integer that allows for reproducibility of results
+        output_y_hat_optimization (bool):
+            Whether this worker should output the target predictions, so that they are
+            stored on disk. Fundamentally, the resampling strategy might shuffle the
+            Y_train targets, so we store the split in order to re-use them for ensemble
+            selection.
+        num_run (Optional[int]):
+            An identifier of the current configuration being fit. This number is unique per
+            configuration.
+        include (Optional[Dict[str, Any]]):
+            An optional dictionary to include components of the pipeline steps.
+        exclude (Optional[Dict[str, Any]]):
+            An optional dictionary to exclude components of the pipeline steps.
+        disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]):
+            By default, the model, its predictions and other metadata are stored on disk
+            for each finished configuration. This argument allows the user to skip
+            writing certain file types, for example the model, to disk.
+        init_params (Optional[Dict[str, Any]]):
+            Optional argument that is passed to each pipeline step. It is the equivalent of
+            kwargs for the pipeline steps.
+        logger_port (Optional[int]):
+            Logging is performed using a socket-server scheme to be robust against many
+            parallel entities that want to write to the same file. This integer states the
+            socket port for the communication channel. If None is provided, a traditional
+            logger is used.
+        instance (str):
+            An instance on which to evaluate the current pipeline. By default we work
+            with a single instance, being the provided X_train, y_train of a single dataset.
+            This instance is a compatibility argument for SMAC, that is capable of working
+            with multiple datasets at the same time.
+    """
     evaluator = FitEvaluator(
         backend=backend,
         queue=queue,
         metric=metric,
         configuration=config,
         seed=seed,
         num_run=num_run,
-        output_y_hat_optimization=output_y_hat_optimization,
         include=include,
         exclude=exclude,
         disable_file_output=disable_file_output,
diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py
@@ -30,9 +30,6 @@
     HoldoutValTypes,
     NoResamplingStrategyTypes
 )
-import autoPyTorch.evaluation.fit_evaluator
-import autoPyTorch.evaluation.train_evaluator
-from autoPyTorch.automl_common.common.utils.backend import Backend
 from autoPyTorch.evaluation.utils import (
     DisableFileOutputParameters,
     empty_queue,
diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py
@@ -418,10 +418,10 @@ def eval_function(
     budget: float,
     config: Optional[Configuration],
     seed: int,
-    output_y_hat_optimization: bool,
     num_run: int,
     include: Optional[Dict[str, Any]],
     exclude: Optional[Dict[str, Any]],
+    output_y_hat_optimization: bool,
     disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
     pipeline_config: Optional[Dict[str, Any]] = None,
     budget_type: str = None,