-from math import ceil
 from typing import Any, Dict, List, Optional, Union

 from ConfigSpace.configuration_space import ConfigurationSpace
 from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
     UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
 )

 import numpy as np

 from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


+def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]:
+    """
+    Returns a list of embedding sizes, one per categorical variable,
+    chosen adaptively based on the training dataset.
+    Note: assumes there is at least one embed feature.
+
+    Args:
+        config (Dict[str, Any]):
+            contains the hyperparameters required to calculate the `num_output_dimensions`
+        num_categs_per_feature (List[int]):
+            list containing the number of categories for each feature that is to be embedded,
+            0 if the column is not an embed column
+
+    Returns:
+        List[int]:
+            list containing the output embedding size for each column,
+            1 if the column is not an embed column
+    """
+
+    max_embedding_dim = config['max_embedding_dim']
+    embed_exponent = config['embed_exponent']
+    size_factor = config['embedding_size_factor']
+    num_output_dimensions = [int(size_factor * max(
+        2,
+        min(max_embedding_dim,
+            1.6 * num_categories ** embed_exponent)))
+        if num_categories > 0 else 1 for num_categories in num_categs_per_feature]
+    return num_output_dimensions
+
+
 class _LearnedEntityEmbedding(nn.Module):
     """ Learned entity embedding module for categorical features"""

@@ -35,9 +66,7 @@ def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, n

         self.num_embed_features = self.num_categories_per_col[self.embed_features]

-        self.num_output_dimensions = [1] * num_features_excl_embed
-        self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in
-                                           enumerate(self.num_embed_features)])
+        self.num_output_dimensions = get_num_output_dimensions(config, self.num_categories_per_col)

         self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions)

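A rough sketch (not part of this commit) of what the adaptive sizing produces with the default hyperparameter values from the reworked search space further down; the category counts are invented for illustration:

    # one non-embed column (0 categories) plus embed columns with 3 and 1000 categories
    config = {'embed_exponent': 0.56, 'max_embedding_dim': 100, 'embedding_size_factor': 1.0}
    get_num_output_dimensions(config, [0, 3, 1000])
    # -> [1, 2, 76]: non-embed columns stay at 1; 1.6 * 3**0.56 ≈ 2.96 is truncated to 2;
    #    1.6 * 1000**0.56 ≈ 76.6 stays below the max_embedding_dim cap of 100 and truncates to 76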
@@ -48,12 +77,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # before passing it through the model
         concat_seq = []

-        x_pointer = 0
         layer_pointer = 0
         for x_pointer, embed in enumerate(self.embed_features):
             current_feature_slice = x[:, x_pointer]
             if not embed:
-                x_pointer += 1
                 concat_seq.append(current_feature_slice.view(-1, 1))
                 continue
             current_feature_slice = current_feature_slice.to(torch.int)
@@ -91,28 +118,24 @@ def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_
     @staticmethod
     def get_hyperparameter_search_space(
         dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
-        dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction",
-                                                                                   value_range=(0, 1),
-                                                                                   default_value=0.5),
+        embed_exponent: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embed_exponent",
+                                                                              value_range=(0.56,),
+                                                                              default_value=0.56),
+        max_embedding_dim: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_embedding_dim",
+                                                                                 value_range=(100,),
+                                                                                 default_value=100),
+        embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor",
+                                                                                     value_range=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5),
+                                                                                     default_value=1,
+                                                                                     ),
     ) -> ConfigurationSpace:
         cs = ConfigurationSpace()
         if dataset_properties is not None:
-            for i in range(len(dataset_properties['categorical_columns'])
-                           if isinstance(dataset_properties['categorical_columns'], List) else 0):
-                # currently as we dont have information about the embedding columns
-                # we search for more dimensions than necessary. This can be solved by
-                # not having `min_unique_values_for_embedding` as a hyperparameter and
-                # instead passing it as a parameter to the feature validator, which
-                # allows us to pass embed_columns to the dataset properties.
-                # TODO: test the trade off
-                # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding`
-                # in one custom transformer. this will also allow users to use this transformer
-                # outside the pipeline
-                ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i),
-                                                                       value_range=dimension_reduction.value_range,
-                                                                       default_value=dimension_reduction.default_value,
-                                                                       log=dimension_reduction.log)
-                add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter)
+            if len(dataset_properties['categorical_columns']) > 0:
+                add_hyperparameter(cs, embed_exponent, UniformFloatHyperparameter)
+                add_hyperparameter(cs, max_embedding_dim, UniformIntegerHyperparameter)
+                add_hyperparameter(cs, embedding_size_factor, CategoricalHyperparameter)
+
         return cs

     @staticmethod
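A minimal usage sketch (again not part of the commit, and the dataset_properties value is hypothetical) showing where the three new hyperparameters end up:

    cs = _LearnedEntityEmbedding.get_hyperparameter_search_space(
        dataset_properties={'categorical_columns': [0, 3, 7]}  # assumed list of categorical column indices
    )
    config = cs.sample_configuration()
    # the sampled configuration now carries 'embed_exponent', 'max_embedding_dim' and
    # 'embedding_size_factor' rather than one 'dimension_reduction_' + str(i) per categorical column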