Internal change

achoum · copybara-github · commit b879c30923ea · 2021-07-29T06:22:11.000-07:00
PiperOrigin-RevId: 387569213
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # Changelog
 
-## 0.1.8 - ???
+## 0.1.8 - 2021-07-28
 
 ### Features
 
diff --git a/configure/setup.py b/configure/setup.py
@@ -20,7 +20,7 @@
 from setuptools.command.install import install
 from setuptools.dist import Distribution
 
-_VERSION = "0.1.7"
+_VERSION = "0.1.8"
 
 with open("README.md", "r", encoding="utf-8") as fh:
   long_description = fh.read()
diff --git a/tensorflow_decision_forests/__init__.py b/tensorflow_decision_forests/__init__.py
@@ -45,7 +45,7 @@
 
 """
 
-__version__ = "0.1.7"
+__version__ = "0.1.8"
 __author__ = "Mathieu Guillame-Bert"
 
 from tensorflow_decision_forests import keras
diff --git a/tensorflow_decision_forests/keras/wrappers_pre_generated.py b/tensorflow_decision_forests/keras/wrappers_pre_generated.py
@@ -83,13 +83,17 @@ class RandomForestModel(core.CoreModel):
       the raw input). Can be used to prepare the features or to stack multiple
       models on top of each other. Unlike preprocessing done in the tf.dataset,
       the operation in "preprocessing" are serialized with the model.
+    postprocessing: Like "preprocessing" but applied on the model output.
     ranking_group: Only for `task=Task.RANKING`. Name of a tf.string feature that
       identifies queries in a query/document ranking task. The ranking group
       is not added automatically for the set of features if
       `exclude_non_specified_features=false`.
-    temp_directory: Temporary directory used during the training. The space
-      required depends on the learner. In many cases, only a temporary copy of a
-      model will be there.
+    temp_directory: Temporary directory used to store the model assets after the
+      training, and possibly as a work directory during the training. This
+      temporary directory is necessary for the model to be exported after
+      training e.g. `model.save(path)`. If not specified, `temp_directory` is
+      set to a temporary directory using `tempfile.TemporaryDirectory`. This
+      directory is deleted when the model Python object is garbage-collected.
     verbose: If true, displays information about the training.
     hyperparameter_template: Override the default value of the hyper-parameters.
       If None (default) the default parameters of the library are used. If set,
@@ -110,6 +114,11 @@ class RandomForestModel(core.CoreModel):
 
     advanced_arguments: Advanced control of the model that most users won't need
       to use. See `AdvancedArguments` for details.
+    num_threads: Number of threads used to train the model. Different learning
+      algorithms use multi-threading differently and with different degrees of
+      efficiency. If the `num_threads` field of
+      `advanced_arguments.yggdrasil_deployment_config` is set, it has priority.
+    name: The name of the model.
     adapt_bootstrap_size_ratio_for_maximum_training_duration: Control how the
       maximum training duration (if set) is applied. If false, the training
       stop when the time is used. If true, adapts the size of the sampled
@@ -254,11 +263,14 @@ def __init__(
       features: Optional[List[core.FeatureUsage]] = None,
       exclude_non_specified_features: Optional[bool] = False,
       preprocessing: Optional["tf.keras.models.Functional"] = None,
+      postprocessing: Optional["tf.keras.models.Functional"] = None,
       ranking_group: Optional[str] = None,
       temp_directory: Optional[str] = None,
       verbose: Optional[bool] = True,
       hyperparameter_template: Optional[str] = None,
       advanced_arguments: Optional[AdvancedArguments] = None,
+      num_threads: Optional[int] = 6,
+      name: Optional[str] = None,
       adapt_bootstrap_size_ratio_for_maximum_training_duration: Optional[
           bool] = False,
       allow_na_conditions: Optional[bool] = False,
@@ -349,10 +361,13 @@ def __init__(
         features=features,
         exclude_non_specified_features=exclude_non_specified_features,
         preprocessing=preprocessing,
+        postprocessing=postprocessing,
         ranking_group=ranking_group,
         temp_directory=temp_directory,
         verbose=verbose,
-        advanced_arguments=advanced_arguments)
+        advanced_arguments=advanced_arguments,
+        num_threads=num_threads,
+        name=name)
 
   @staticmethod
   def predefined_hyperparameters() -> List[core.HyperParameterTemplate]:
@@ -418,13 +433,17 @@ class GradientBoostedTreesModel(core.CoreModel):
       the raw input). Can be used to prepare the features or to stack multiple
       models on top of each other. Unlike preprocessing done in the tf.dataset,
       the operation in "preprocessing" are serialized with the model.
+    postprocessing: Like "preprocessing" but applied on the model output.
     ranking_group: Only for `task=Task.RANKING`. Name of a tf.string feature that
       identifies queries in a query/document ranking task. The ranking group
       is not added automatically for the set of features if
       `exclude_non_specified_features=false`.
-    temp_directory: Temporary directory used during the training. The space
-      required depends on the learner. In many cases, only a temporary copy of a
-      model will be there.
+    temp_directory: Temporary directory used to store the model assets after the
+      training, and possibly as a work directory during the training. This
+      temporary directory is necessary for the model to be exported after
+      training e.g. `model.save(path)`. If not specified, `temp_directory` is
+      set to a temporary directory using `tempfile.TemporaryDirectory`. This
+      directory is deleted when the model Python object is garbage-collected.
     verbose: If true, displays information about the training.
     hyperparameter_template: Override the default value of the hyper-parameters.
       If None (default) the default parameters of the library are used. If set,
@@ -445,6 +464,11 @@ class GradientBoostedTreesModel(core.CoreModel):
 
     advanced_arguments: Advanced control of the model that most users won't need
       to use. See `AdvancedArguments` for details.
+    num_threads: Number of threads used to train the model. Different learning
+      algorithms use multi-threading differently and with different degrees of
+      efficiency. If the `num_threads` field of
+      `advanced_arguments.yggdrasil_deployment_config` is set, it has priority.
+    name: The name of the model.
     adapt_subsample_for_maximum_training_duration: Control how the maximum
       training duration (if set) is applied. If false, the training stop when
       the time is used. If true, the size of the sampled datasets used train
@@ -644,11 +668,14 @@ def __init__(
       features: Optional[List[core.FeatureUsage]] = None,
       exclude_non_specified_features: Optional[bool] = False,
       preprocessing: Optional["tf.keras.models.Functional"] = None,
+      postprocessing: Optional["tf.keras.models.Functional"] = None,
       ranking_group: Optional[str] = None,
       temp_directory: Optional[str] = None,
       verbose: Optional[bool] = True,
       hyperparameter_template: Optional[str] = None,
       advanced_arguments: Optional[AdvancedArguments] = None,
+      num_threads: Optional[int] = 6,
+      name: Optional[str] = None,
       adapt_subsample_for_maximum_training_duration: Optional[bool] = False,
       allow_na_conditions: Optional[bool] = False,
       apply_link_function: Optional[bool] = True,
@@ -780,10 +807,13 @@ def __init__(
         features=features,
         exclude_non_specified_features=exclude_non_specified_features,
         preprocessing=preprocessing,
+        postprocessing=postprocessing,
         ranking_group=ranking_group,
         temp_directory=temp_directory,
         verbose=verbose,
-        advanced_arguments=advanced_arguments)
+        advanced_arguments=advanced_arguments,
+        num_threads=num_threads,
+        name=name)
 
   @staticmethod
   def predefined_hyperparameters() -> List[core.HyperParameterTemplate]:
@@ -848,32 +878,44 @@ class CartModel(core.CoreModel):
       the raw input). Can be used to prepare the features or to stack multiple
       models on top of each other. Unlike preprocessing done in the tf.dataset,
       the operation in "preprocessing" are serialized with the model.
-    ranking_group: Only for `task=Task.RANKING`. Name of a tf.string feature
-      that identifies queries in a query/document ranking task. The ranking
-      group is not added automatically for the set of features if
+    postprocessing: Like "preprocessing" but applied on the model output.
+    ranking_group: Only for `task=Task.RANKING`. Name of a tf.string feature that
+      identifies queries in a query/document ranking task. The ranking group
+      is not added automatically for the set of features if
       `exclude_non_specified_features=false`.
-    temp_directory: Temporary directory used during the training. The space
-      required depends on the learner. In many cases, only a temporary copy of a
-      model will be there.
+    temp_directory: Temporary directory used to store the model assets after the
+      training, and possibly as a work directory during the training. This
+      temporary directory is necessary for the model to be exported after
+      training e.g. `model.save(path)`. If not specified, `temp_directory` is
+      set to a temporary directory using `tempfile.TemporaryDirectory`. This
+      directory is deleted when the model Python object is garbage-collected.
     verbose: If true, displays information about the training.
     hyperparameter_template: Override the default value of the hyper-parameters.
       If None (default) the default parameters of the library are used. If set,
       `default_hyperparameter_template` refers to one of the following
       preconfigured hyper-parameter sets. Those sets outperforms the default
-      hyper-parameters (either generally or in specific scenarios). You can omit
-      the version (e.g. remove "@v5") to use the last version of the template.
-      In this case, the hyper-parameter can change in between releases (not
-      recommended for training in production).
+      hyper-parameters (either generally or in specific scenarios).
+      You can omit the version (e.g. remove "@v5") to use the last version of
+      the template. In this case, the hyper-parameter can change in between
+      releases (not recommended for training in production).
+      
+
     advanced_arguments: Advanced control of the model that most users won't need
       to use. See `AdvancedArguments` for details.
+    num_threads: Number of threads used to train the model. Different learning
+      algorithms use multi-threading differently and with different degrees of
+      efficiency. If the `num_threads` field of
+      `advanced_arguments.yggdrasil_deployment_config` is set, it has priority.
+    name: The name of the model.
     allow_na_conditions: If true, the tree training evaluates conditions of the
       type `X is NA` i.e. `X is missing`. Default: False.
     categorical_algorithm: How to learn splits on categorical attributes.
       - `CART`: CART algorithm. Find categorical splits of the form "value \\in
         mask". The solution is exact for binary classification, regression and
         ranking. It is approximated for multi-class classification. This is a
-        good first algorithm to use. In case of overfitting (very small dataset,
-        large dictionary), the "random" algorithm is a good alternative.
+        good first algorithm to use. In case of overfitting (very small
+        dataset, large dictionary), the "random" algorithm is a good
+        alternative.
       - `ONE_HOT`: One-hot encoding. Find the optimal categorical split of the
         form "attribute == param". This method is similar (but more efficient)
         than converting converting each possible categorical value into a
@@ -894,7 +936,7 @@ class CartModel(core.CoreModel):
       available, the least frequent items are ignored. Changing this value is
       similar to change the "max_vocab_count" before loading the dataset, with
       the following exception: With `max_vocab_count`, all the remaining items
-        are grouped in a special Out-of-vocabulary item. With `max_num_items`,
+      are grouped in a special Out-of-vocabulary item. With `max_num_items`,
       this is not the case. Default: -1.
     categorical_set_split_min_item_frequency: For categorical set splits e.g.
       texts. Minimum number of occurrences of an item to be considered.
@@ -904,16 +946,16 @@ class CartModel(core.CoreModel):
         words, as long as a node satisfy the splits "constraints (e.g. maximum
         depth, minimum number of observations), the node will be split. This is
         the "classical" way to grow decision trees.
-      - `BEST_FIRST_GLOBAL`: The node with the best loss reduction among all the
-        nodes of the tree is selected for splitting. This method is also called
-        "best first" or "leaf-wise growth". See "Best-first decision
+      - `BEST_FIRST_GLOBAL`: The node with the best loss reduction among all
+        the nodes of the tree is selected for splitting. This method is also
+        called "best first" or "leaf-wise growth". See "Best-first decision
         tree learning", Shi and "Additive logistic regression : A statistical
         view of boosting", Friedman for more details. Default: "LOCAL".
     in_split_min_examples_check: Whether to check the `min_examples` constraint
       in the split search (i.e. splits leading to one child having less than
-      `min_examples` examples are considered invalid) or before the split search
-      (i.e. a node can be derived only if it contains more than `min_examples`
-      examples). If false, there can be nodes with less than
+      `min_examples` examples are considered invalid) or before the split
+      search (i.e. a node can be derived only if it contains more than
+      `min_examples` examples). If false, there can be nodes with less than
       `min_examples` training examples. Default: True.
     max_depth: Maximum depth of the tree. `max_depth=1` means that all trees
       will be roots. Negative values are ignored. Default: 16.
@@ -926,9 +968,9 @@ class CartModel(core.CoreModel):
       model training non-deterministic. Default: -1.0.
     min_examples: Minimum number of examples in a node. Default: 5.
     missing_value_policy: Method used to handle missing attribute values.
-      - `GLOBAL_IMPUTATION`: Missing attribute values are imputed, with the mean
-        (in case of numerical attribute) or the most-frequent-item (in case of
-        categorical attribute) computed on the entire dataset (i.e. the
+      - `GLOBAL_IMPUTATION`: Missing attribute values are imputed, with the
+        mean (in case of numerical attribute) or the most-frequent-item (in
+        case of categorical attribute) computed on the entire dataset (i.e. the
         information contained in the data spec).
       - `LOCAL_IMPUTATION`: Missing attribute values are imputed with the mean
         (numerical attribute) or most-frequent-item (in the case of categorical
@@ -942,24 +984,24 @@ class CartModel(core.CoreModel):
       node. An attribute is valid if it has at least a valid split. If
       `num_candidate_attributes=0`, the value is set to the classical default
       value for Random Forest: `sqrt(number of input attributes)` in case of
-        classification and `number_of_input_attributes / 3` in case of
-        regression. If `num_candidate_attributes=-1`, all the attributes are
+      classification and `number_of_input_attributes / 3` in case of
+      regression. If `num_candidate_attributes=-1`, all the attributes are
       tested. Default: 0.
     num_candidate_attributes_ratio: Ratio of attributes tested at each node. If
       set, it is equivalent to `num_candidate_attributes =
       number_of_input_features x num_candidate_attributes_ratio`. The possible
       values are between ]0, and 1] as well as -1. If not set or equal to -1,
       the `num_candidate_attributes` is used. Default: -1.0.
-    sorting_strategy: How are sorted the numerical features in order to find the
-      splits
+    sorting_strategy: How the numerical features are sorted in order to find
+      the splits
       - PRESORT: The features are pre-sorted at the start of the training. This
         solution is faster but consumes much more memory than IN_NODE.
       - IN_NODE: The features are sorted just before being used in the node.
         This solution is slow but consumes little amount of memory.
       . Default: "PRESORT".
     sparse_oblique_normalization: For sparse oblique splits i.e.
-      `split_axis=SPARSE_OBLIQUE`. Normalization applied on the features, before
-      applying the sparse oblique projections.
+      `split_axis=SPARSE_OBLIQUE`. Normalization applied on the features,
+      before applying the sparse oblique projections.
       - `NONE`: No normalization.
       - `STANDARD_DEVIATION`: Normalize the feature by the estimated standard
         deviation on the entire train dataset. Also known as Z-Score
@@ -969,19 +1011,20 @@ class CartModel(core.CoreModel):
     sparse_oblique_num_projections_exponent: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Controls of the number of random projections
       to test at each node as `num_features^num_projections_exponent`. Default:
-        None.
+      None.
     sparse_oblique_projection_density_factor: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Controls of the number of random projections
       to test at each node as `num_features^num_projections_exponent`. Default:
-        None.
+      None.
     split_axis: What structure of split to consider for numerical features.
-      - `AXIS_ALIGNED`: Axis aligned splits (i.e. one condition at a time). This
-        is the "classical" way to train a tree. Default value.
+      - `AXIS_ALIGNED`: Axis aligned splits (i.e. one condition at a time).
+        This is the "classical" way to train a tree. Default value.
       - `SPARSE_OBLIQUE`: Sparse oblique splits (i.e. splits one a small number
         of features) from "Sparse Projection Oblique Random Forests", Tomita et
         al., 2020. Default: "AXIS_ALIGNED".
     validation_ratio: Ratio of the training dataset used to create the
       validation dataset used to prune the tree. Default: 0.1.
+
   """
 
   @core._list_explicit_arguments
@@ -990,11 +1033,14 @@ def __init__(self,
                features: Optional[List[core.FeatureUsage]] = None,
                exclude_non_specified_features: Optional[bool] = False,
                preprocessing: Optional["tf.keras.models.Functional"] = None,
+               postprocessing: Optional["tf.keras.models.Functional"] = None,
                ranking_group: Optional[str] = None,
                temp_directory: Optional[str] = None,
                verbose: Optional[bool] = True,
                hyperparameter_template: Optional[str] = None,
                advanced_arguments: Optional[AdvancedArguments] = None,
+               num_threads: Optional[int] = 6,
+               name: Optional[str] = None,
                allow_na_conditions: Optional[bool] = False,
                categorical_algorithm: Optional[str] = "CART",
                categorical_set_split_greedy_sampling: Optional[float] = 0.1,
@@ -1072,10 +1118,13 @@ def __init__(self,
         features=features,
         exclude_non_specified_features=exclude_non_specified_features,
         preprocessing=preprocessing,
+        postprocessing=postprocessing,
         ranking_group=ranking_group,
         temp_directory=temp_directory,
         verbose=verbose,
-        advanced_arguments=advanced_arguments)
+        advanced_arguments=advanced_arguments,
+        num_threads=num_threads,
+        name=name)
 
   @staticmethod
   def predefined_hyperparameters() -> List[core.HyperParameterTemplate]:
diff --git a/tools/test_bazel.sh b/tools/test_bazel.sh