@@ -204,7 +204,6 @@ def test_featurevalidator_supported_types(input_data_featuretest):
204204 assert sparse .issparse (transformed_X )
205205 else :
206206 assert isinstance (transformed_X , np .ndarray )
207- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
208207 assert np .issubdtype (transformed_X .dtype , np .number )
209208 assert validator ._is_fitted
210209
@@ -237,11 +236,10 @@ def test_featurevalidator_categorical_nan(input_data_featuretest):
237236 validator .fit (input_data_featuretest )
238237 transformed_X = validator .transform (input_data_featuretest )
239238 assert any (pd .isna (input_data_featuretest ))
240- categories_ = validator .column_transformer .named_transformers_ [ 'categorical_pipeline' ]. \
241- named_steps ['ordinalencoder ' ].categories_
239+ categories_ = validator .column_transformer .\
240+ named_transformers_ [ 'categorical_pipeline' ]. named_steps ['onehotencoder ' ].categories_
242241 assert any (('0' in categories ) or (0 in categories ) or ('missing_value' in categories ) for categories in
243242 categories_ )
244- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
245243 assert np .issubdtype (transformed_X .dtype , np .number )
246244 assert validator ._is_fitted
247245 assert isinstance (transformed_X , np .ndarray )
@@ -294,7 +292,6 @@ def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
294292 else :
295293 raise ValueError (type (input_data_featuretest ))
296294 transformed_X = validator .transform (complementary_type )
297- assert np .shape (input_data_featuretest ) == np .shape (transformed_X )
298295 assert np .issubdtype (transformed_X .dtype , np .number )
299296 assert validator ._is_fitted
300297
@@ -314,12 +311,6 @@ def test_featurevalidator_get_columns_to_encode():
314311 for col in df .columns :
315312 df [col ] = df [col ].astype (col )
316313
317- < << << << HEAD
318- transformed_columns , feature_types = validator ._get_columns_to_encode (df )
319-
320- assert transformed_columns == ['category' , 'bool' ]
321- assert feature_types == ['numerical' , 'numerical' , 'categorical' , 'categorical' ]
322- == == == =
323314 validator .fit (df )
324315
325316 categorical_columns , numerical_columns , feat_type = validator ._get_columns_info (df )
@@ -435,7 +426,6 @@ def test_feature_validator_remove_nan_catcolumns():
435426 )
436427 ans_test = np .array ([[0 , 0 , 0 , 0 ], [0 , 0 , 0 , 0 ]], dtype = np .float64 )
437428 feature_validator_remove_nan_catcolumns (df_train , df_test , ans_train , ans_test )
438- >> >> >> > Bug fixes (#249)
439429
440430
441431def test_features_unsupported_calls_are_raised ():
@@ -445,36 +435,29 @@ def test_features_unsupported_calls_are_raised():
445435 expected
446436 """
447437 validator = TabularFeatureValidator ()
448- with pytest .raises (ValueError , match = r"AutoPyTorch does not support time " ):
438+ with pytest .raises (TypeError , match = r".*?Convert the time information to a numerical value " ):
449439 validator .fit (
450440 pd .DataFrame ({'datetime' : [pd .Timestamp ('20180310' )]})
451441 )
442+ validator = TabularFeatureValidator ()
452443 with pytest .raises (ValueError , match = r"AutoPyTorch only supports.*yet, the provided input" ):
453444 validator .fit ({'input1' : 1 , 'input2' : 2 })
454- with pytest .raises (ValueError , match = r"has unsupported dtype string" ):
445+ validator = TabularFeatureValidator ()
446+ with pytest .raises (TypeError , match = r".*?but input column A has an invalid type `string`.*" ):
455447 validator .fit (pd .DataFrame ([{'A' : 1 , 'B' : 2 }], dtype = 'string' ))
448+ validator = TabularFeatureValidator ()
456449 with pytest .raises (ValueError , match = r"The feature dimensionality of the train and test" ):
457450 validator .fit (X_train = np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]),
458451 X_test = np .array ([[1 , 2 , 3 , 4 ], [4 , 5 , 6 , 7 ]]),
459452 )
453+ validator = TabularFeatureValidator ()
460454 with pytest .raises (ValueError , match = r"Cannot call transform on a validator that is not fit" ):
461455 validator .transform (np .array ([[1 , 2 , 3 ], [4 , 5 , 6 ]]))
462456
463457
464458@pytest .mark .parametrize (
465459 'input_data_featuretest' ,
466460 (
467- 'numpy_numericalonly_nonan' ,
468- 'numpy_numericalonly_nan' ,
469- 'pandas_numericalonly_nonan' ,
470- 'pandas_numericalonly_nan' ,
471- 'list_numericalonly_nonan' ,
472- 'list_numericalonly_nan' ,
473- # Category in numpy is handled via feat_type
474- 'numpy_categoricalonly_nonan' ,
475- 'numpy_mixed_nonan' ,
476- 'numpy_categoricalonly_nan' ,
477- 'numpy_mixed_nan' ,
478461 'sparse_bsr_nonan' ,
479462 'sparse_bsr_nan' ,
480463 'sparse_coo_nonan' ,
@@ -512,7 +495,7 @@ def test_no_column_transformer_created(input_data_featuretest):
512495)
513496def test_column_transformer_created (input_data_featuretest ):
514497 """
515- This test ensures an encoder is created if categorical data is provided
498+ This test ensures a column transformer is created if categorical data is provided
516499 """
517500 validator = TabularFeatureValidator ()
518501 validator .fit (input_data_featuretest )
@@ -521,7 +504,7 @@ def test_column_transformer_created(input_data_featuretest):
521504
522505 # Make sure that the encoded features are actually encoded. Categorical columns are at
523506 # the start after transformation. In our fixtures, this is also honored prior to encoding
524- transformed_columns , feature_types = validator ._get_columns_to_encode (input_data_featuretest )
507+ cat_columns , _ , feature_types = validator ._get_columns_info (input_data_featuretest )
525508
526509 # At least one categorical
527510 assert 'categorical' in validator .feat_type
@@ -530,20 +513,13 @@ def test_column_transformer_created(input_data_featuretest):
530513 if np .any ([pd .api .types .is_numeric_dtype (input_data_featuretest [col ]
531514 ) for col in input_data_featuretest .columns ]):
532515 assert 'numerical' in validator .feat_type
533- for i , feat_type in enumerate (feature_types ):
534- if 'numerical' in feat_type :
535- np .testing .assert_array_equal (
536- transformed_X [:, i ],
537- input_data_featuretest [input_data_featuretest .columns [i ]].to_numpy ()
538- )
539- elif 'categorical' in feat_type :
540- np .testing .assert_array_equal (
541- transformed_X [:, i ],
542- # Expect always 0, 1... because we use a ordinal encoder
543- np .array ([0 , 1 ])
544- )
545- else :
546- raise ValueError (feat_type )
516+ # we expect this input to be the fixture 'pandas_mixed_nan'
517+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , - 1. ], [0. , 1. , 1. ]]))
518+ else :
519+ np .testing .assert_array_equal (transformed_X , np .array ([[1. , 0. , 1. , 0. ], [0. , 1. , 0. , 1. ]]))
520+
521+ if not all ([feat_type in ['numerical' , 'categorical' ] for feat_type in feature_types ]):
522+ raise ValueError ("Expected only numerical and categorical feature types" )
547523
548524
549525def test_no_new_category_after_fit ():
@@ -575,13 +551,12 @@ def test_unknown_encode_value():
575551 x ['c' ].cat .add_categories (['NA' ], inplace = True )
576552 x .loc [0 , 'c' ] = 'NA' # unknown value
577553 x_t = validator .transform (x )
578- # The first row should have a -1 as we added a new categorical there
579- expected_row = [- 1 , - 41 , - 3 , - 987.2 ]
554+ # The first row should start with 0, 0 because we added a
555+ # new category there and the one hot encoder marks an
556+ # unknown category as all zeros in the transformed columns
557+ expected_row = [0.0 , 0.0 , - 0.5584294383572701 , 0.5000000000000004 , - 1.5136598016833485 ]
580558 assert expected_row == x_t [0 ].tolist ()
581559
582- # Notice how there is only one column 'c' to encode
583- assert validator .categories == [list (range (2 )) for i in range (1 )]
584-
585560
586561# Actual checks for the features
587562@pytest .mark .parametrize (
@@ -633,19 +608,20 @@ def test_feature_validator_new_data_after_fit(
633608 assert sparse .issparse (transformed_X )
634609 else :
635610 assert isinstance (transformed_X , np .ndarray )
636- assert np .shape (X_test ) == np .shape (transformed_X )
637611
638612 # And then check proper error messages
639613 if train_data_type == 'pandas' :
640614 old_dtypes = copy .deepcopy (validator .dtypes )
641615 validator .dtypes = ['dummy' for dtype in X_train .dtypes ]
642- with pytest .raises (ValueError , match = r"Changing the dtype of the features after fit" ):
616+ with pytest .raises (ValueError ,
617+ match = r"The dtype of the features must not be changed after fit" ):
643618 transformed_X = validator .transform (X_test )
644619 validator .dtypes = old_dtypes
645620 if test_data_type == 'pandas' :
646621 columns = X_test .columns .tolist ()
647622 X_test = X_test [reversed (columns )]
648- with pytest .raises (ValueError , match = r"Changing the column order of the features" ):
623+ with pytest .raises (ValueError ,
624+ match = r"The column order of the features must not be changed after fit" ):
649625 transformed_X = validator .transform (X_test )
650626
651627
0 commit comments