import datetime
import sys

# Make the Optimus checkout importable. The path is relative to the current
# working directory, so this presumably expects the script to be launched
# from tests/creators -- TODO confirm.
sys.path.append("../..")

# Algorithms exercised through ``string_clustering``; the order fixes the
# order in which the test cases are registered.
_CLUSTERING_ALGORITHMS = (
    "fingerprint",
    "ngram_fingerprint",
    "metaphone",
    "nysiis",
    "match_rating_codex",
    "double_metaphone",
    "soundex",
    "levenshtein",
)

# Phonetic ``cols.*`` methods that all share the same four variants.
_PHONETIC_METHODS = (
    "metaphone",
    "nysiis",
    "match_rating_codex",
    "double_metaphone",
    "soundex",
)

# Column selection reused by every "multiple" variant.
_MULTIPLE_COLS = ("NullType", "Cybertronian", "timestamp")
_MULTIPLE_OUTPUT_COLS = ("nt", "ct", "ts")


def _create_default_variants(t, method):
    """Register the all/string/numeric/multiple variants of one cols method."""
    t.create(method=method, variant="all", cols="*")
    t.create(method=method, variant="string", cols=["names"])
    t.create(method=method, variant="numeric", cols=["rank"],
             output_cols=["rk"])
    # Fresh lists per call so no two registered cases share a mutable object.
    t.create(method=method, variant="multiple", cols=list(_MULTIPLE_COLS),
             output_cols=list(_MULTIPLE_OUTPUT_COLS))


def create():
    """Register and generate the "stringclustering" test cases (pandas engine).

    Builds a six-row sample dataframe, registers one case per
    method/variant combination on a ``TestCreator`` named
    ``"stringclustering"`` and finally calls ``t.run()``.

    Every column named in ``cols`` / ``other_cols`` must exist in the
    dataframe built below. Earlier revisions referenced ``japanese name``
    and ``function(binary)``, which this dataframe does not contain; those
    references have been removed or mapped to ``function``.
    """
    from optimus import Optimus
    from optimus.tests.creator import TestCreator, default_configs

    op = Optimus("pandas")
    df = op.create.dataframe({
        'NullType': [None, None, None, None, None, None],
        'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10',
                         '1980/04/10', '1980/04/10', '1980/04/10'],
        'height(ft)': [-28, 17, 26, 13, None, 300],
        ('last date seen', 'date'): ['2016/09/10', '2015/08/10', '2014/07/10',
                                     '2013/06/10', '2012/05/10', '2011/04/10'],
        'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534',
                               '37.789563,-122.400356', '33.670666,-117.841553',
                               None, None],
        'rank': [10, 7, 7, 8, 10, 8],
        ('Cybertronian', 'bool'): [True, True, True, True, True, False],
        'Date Type': [datetime.datetime(2016, 9, 10),
                      datetime.datetime(2015, 8, 10),
                      datetime.datetime(2014, 6, 24),
                      datetime.datetime(2013, 6, 24),
                      datetime.datetime(2012, 5, 10),
                      datetime.datetime(2011, 4, 10)],
        ('age', 'int'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000],
        ('function', 'string'): ['Leader', 'Espionage', 'Security',
                                 'First Lieutenant', None, 'Battle Station'],
        ('names', 'str'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee',
                           'métrop´le-x', 'Metroplex_)^$'],
        ('timestamp', 'time'): [datetime.datetime(2014, 6, 24, 0, 0),
                                datetime.datetime(2014, 6, 24, 0, 0),
                                datetime.datetime(2014, 6, 24, 0, 0),
                                datetime.datetime(2014, 6, 24, 0, 0),
                                datetime.datetime(2014, 6, 24, 0, 0),
                                datetime.datetime(2014, 6, 24, 0, 0)],
        ('weight(t)', 'float'): [4.3, 2.0, 4.0, 1.8, 5.7, None],
    })

    t = TestCreator(op, df, name="stringclustering", configs=default_configs)

    # --- string_clustering: every column selection x every algorithm -------
    selections = (
        ("all", "*"),
        ("numeric", ["rank"]),
        ("string", ["names"]),
        ("multiple", list(_MULTIPLE_COLS)),
    )
    for prefix, cols in selections:
        for algorithm in _CLUSTERING_ALGORITHMS:
            # Copy list selections so each registered case owns its argument.
            case_cols = cols if isinstance(cols, str) else list(cols)
            t.create(method="string_clustering",
                     variant=prefix + "_" + algorithm,
                     cols=case_cols, algorithm=algorithm)

    # --- cols.fingerprint ---------------------------------------------------
    _create_default_variants(t, "cols.fingerprint")

    # --- cols.pos / cols.ngrams ---------------------------------------------
    # "japanese name" is not a column of this dataframe, so the "multiple"
    # variants only use the two date-like string columns.
    t.create(method="cols.pos", variant="single", cols=["names"])
    t.create(method="cols.pos", variant="multiple",
             cols=["date arrival", "last date seen"],
             output_cols=["da", "lds"])

    t.create(method="cols.ngrams", variant="single", cols=["names"])
    t.create(method="cols.ngrams", variant="multiple",
             cols=["date arrival", "last date seen"], n_size=1,
             output_cols=["da", "lds"])

    # --- cols.ngram_fingerprint ---------------------------------------------
    t.create(method="cols.ngram_fingerprint", variant="all", cols="*")
    # "function(binary)" does not exist here; use the plain "function" column.
    t.create(method="cols.ngram_fingerprint", variant="string",
             cols=["function"], n_size=25)
    t.create(method="cols.ngram_fingerprint", variant="numeric",
             cols=["rank"], output_cols=["rk"])
    t.create(method="cols.ngram_fingerprint", variant="multiple",
             cols=list(_MULTIPLE_COLS), n_size=4,
             output_cols=list(_MULTIPLE_OUTPUT_COLS))

    # --- phonetic encoders --------------------------------------------------
    for name in _PHONETIC_METHODS:
        _create_default_variants(t, "cols." + name)

    # --- cols.levenshtein ---------------------------------------------------
    t.create(method="cols.levenshtein", variant="all_value", cols="*",
             value=["1a#-s", "ERR", "d2e", "0", "[]", "''", "", "1", "lu",
                    "2016", "5000000", "aeiou", "abc#&^", "2014-06-23",
                    "nan."])
    # One entry per dataframe column (13); the two columns that are absent
    # from this dataframe have been dropped from the list.
    t.create(method="cols.levenshtein", variant="all_col", cols="*",
             other_cols=['date arrival', 'weight(t)', 'age', 'height(ft)',
                         'rank', 'last date seen', 'names',
                         'last position seen', 'Cybertronian', 'NullType',
                         'Date Type', 'function', 'timestamp'])
    t.create(method="cols.levenshtein", variant="single_value",
             cols=["names"], value="prime", output_cols="nms")
    t.create(method="cols.levenshtein", variant="single_col",
             cols=["rank"], other_cols=["weight(t)"])
    t.create(method="cols.levenshtein", variant="multiple_value",
             cols=["last position seen", "age"], value=["10005", "000"])
    t.create(method="cols.levenshtein", variant="multiple_col",
             cols=list(_MULTIPLE_COLS),
             other_cols=["height(ft)", "function", "Date Type"],
             output_cols=["nt-ht", "ct-ft", "ts-dt"])

    t.run()


create()
b/tests/test_created__stringclustering.py @@ -0,0 +1,1330 @@ +import datetime +import numpy as np +from optimus.tests.base import TestBase +from optimus.helpers.json import json_encoding +from optimus.helpers.functions import deep_sort, df_dicts_equal, results_equal + + +def Timestamp(t): + return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S") + + +NaT = np.datetime64('NaT') +nan = float("nan") +inf = float("inf") + + +class TestStringclusteringPandas(TestBase): + config = {'engine': 'pandas'} + dict = {('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]} + maxDiff = None + + def 
test_cols_double_metaphone_all(self): + df = self.df.copy() + result = df.cols.double_metaphone(cols='*') + expected = self.create_dataframe(data={('NullType', 'object'): [('NN', ''), ('NN', ''), ('NN', ''), ('NN', ''), ('NN', ''), ('NN', '')], ('date arrival', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('height(ft)', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('NN', ''), ('', '')], ('last date seen', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('last position seen', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('NN', ''), ('NN', '')], ('rank', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('Cybertronian', 'object'): [('TR', ''), ('TR', ''), ('TR', ''), ('TR', ''), ('TR', ''), ('FLS', '')], ('Date Type', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('age', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('function', 'object'): [('LTR', ''), ('ASPNJ', 'ASPNK'), ('SKRT', ''), ('FRSTLTNNT', ''), ('NN', ''), ('PTLSTXN', '')], ('names', 'object'): [('APTMS', ''), ('PMPLLP', ''), ('MTRPLKS', ''), ('PMPLP', ''), ('MTRPPLKS', ''), ('MTRPLKSKSKSKSKS', '')], ('timestamp', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('weight(t)', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('NN', '')]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_double_metaphone_multiple(self): + df = self.df.copy() + result = df.cols.double_metaphone(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('nt', 'object'): [('NN', ''), ('NN', ''), ('NN', ''), ('NN', ''), ('NN', ''), ('NN', '')], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], 
('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('ct', 'object'): [('TR', ''), ('TR', ''), ('TR', ''), ('TR', ''), ('TR', ''), ('FLS', '')], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('ts', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_double_metaphone_numeric(self): + df = self.df.copy() + result = df.cols.double_metaphone(cols=['rank'], output_cols=['rk']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', 
'2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_double_metaphone_string(self): + df = self.df.copy() + result = df.cols.double_metaphone(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 
'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): [('APTMS', ''), ('PMPLLP', ''), ('MTRPLKS', ''), ('PMPLP', ''), ('MTRPPLKS', ''), ('MTRPLKSKSKSKSKS', '')], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_fingerprint_all(self): + df = self.df.copy() + result = df.cols.fingerprint(cols='*') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_fingerprint_multiple(self): + df = self.df.copy() + result = df.cols.fingerprint(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_fingerprint_numeric(self): + df = self.df.copy() + result = df.cols.fingerprint(cols=['rank'], output_cols=['rk']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def 
test_cols_fingerprint_string(self): + df = self.df.copy() + result = df.cols.fingerprint(cols=['names']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_all_col(self): + df = self.df.copy() + result = df.cols.levenshtein(cols='*', other_cols=['date arrival', 'weight(t)', 'age', 'height(ft)', 'japanese name', 'rank', 'last date seen', 'names', 'last position seen', 'Cybertronian', 'NullType', 'Date Type', 'function(binary)', 'function', 'timestamp']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_all_value(self): + df = self.df.copy() + result = df.cols.levenshtein(cols='*', value=['1a#-s', 'ERR', 'd2e', '0', '[]', "''", '', '1', 'lu', '2016', '5000000', 'aeiou', 'abc#&^', '2014-06-23', 'nan.']) + expected = self.create_dataframe(data={('NullType', 'int64'): [5, 5, 5, 5, 5, 5], ('date arrival', 'int64'): [10, 10, 10, 10, 10, 10], ('height(ft)', 'int64'): [4, 4, 4, 4, 3, 5], ('last date seen', 'int64'): [9, 9, 9, 9, 9, 9], ('last position seen', 'int64'): [20, 20, 21, 21, 4, 4], ('rank', 'int64'): [2, 2, 2, 2, 2, 2], ('Cybertronian', 'int64'): [4, 4, 4, 4, 4, 5], ('Date Type', 'int64'): [9, 9, 9, 9, 9, 9], ('age', 'int64'): [7, 7, 7, 7, 7, 7], ('function', 'int64'): [6, 9, 8, 16, 4, 14], ('names', 'int64'): [7, 12, 9, 9, 11, 13], ('timestamp', 'int64'): [10, 10, 10, 10, 10, 10], ('weight(t)', 'int64'): [6, 6, 6, 6, 6, 6]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_multiple_col(self): + df = self.df.copy() + result = df.cols.levenshtein(cols=['NullType', 'Cybertronian', 'timestamp'], other_cols=['height(ft)', 'function', 'Date Type'], output_cols=['nt-ht', 'ct-ft', 'ts-dt']) + 
expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ("NullType_['nt-ht', 'ct-ft', 'ts-dt']", 'int64'): [5, 4, 4, 4, 3, 5], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ("Cybertronian_['nt-ht', 'ct-ft', 'ts-dt']", 'int64'): [5, 8, 7, 13, 3, 12], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ("timestamp_['nt-ht', 'ct-ft', 'ts-dt']", 'int64'): [4, 4, 0, 1, 4, 4], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_multiple_value(self): + df = self.df.copy() + result = df.cols.levenshtein(cols=['last position seen', 'age', 'japanese name'], value=['10005', '000', "['Bumble']"]) + # The 
following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_single_col(self): + df = self.df.copy() + result = df.cols.levenshtein(cols=['rank'], other_cols=['weight(t)']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [3, 3, 3, 2, 3, 3], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_single_value(self): + df = self.df.copy() + result = df.cols.levenshtein(cols=['names'], 
value='prime', output_cols='nms') + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('nms', 'int64'): [4, 11, 7, 8, 9, 11], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_match_rating_codex_all(self): + df = self.df.copy() + result = df.cols.match_rating_codex(cols='*') + expected = self.create_dataframe(data={('NullType', 'object'): ['N', 'N', 'N', 'N', 'N', 'N'], ('date arrival', 'object'): ['198/10', '198/10', '198/10', '198/10', '198/10', '198/10'], ('height(ft)', 'object'): ['-28.0', '17.0', 
'26.0', '13.0', 'N', '30.0'], ('last date seen', 'object'): ['201/10', '201/10', '201/10', '201/10', '201/10', '201/10'], ('last position seen', 'object'): ['19.201', '10.534', '37.356', '3.6153', 'N', 'N'], ('rank', 'object'): ['10', '7', '7', '8', '10', '8'], ('Cybertronian', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FLS'], ('Date Type', 'object'): ['201-10', '201-10', '201-24', '201-24', '201-10', '201-10'], ('age', 'object'): ['50', '50', '50', '50', '50', '50'], ('function', 'object'): ['LDR', 'ESPNG', 'SCRTY', 'FRSTNT', 'N', 'BTLSTN'], ('names', 'object'): ['OPTMS', 'BMB#BÉ', 'MTRPLX', 'BMBLB', 'MÉTL-X', 'MTR)^$'], ('timestamp', 'object'): ['201-24', '201-24', '201-24', '201-24', '201-24', '201-24'], ('weight(t)', 'object'): ['4.3', '2.0', '4.0', '1.8', '5.7', 'N']}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_match_rating_codex_multiple(self): + df = self.df.copy() + result = df.cols.match_rating_codex(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('nt', 'object'): ['N', 'N', 'N', 'N', 'N', 'N'], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('ct', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FLS'], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 
00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('ts', 'object'): ['201-24', '201-24', '201-24', '201-24', '201-24', '201-24'], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_match_rating_codex_numeric(self): + df = self.df.copy() + result = df.cols.match_rating_codex(cols=['rank'], output_cols=['rk']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'object'): ['10', '7', '7', '8', '10', '8'], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 
5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_match_rating_codex_string(self): + df = self.df.copy() + result = df.cols.match_rating_codex(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['OPTMS', 'BMB#BÉ', 'MTRPLX', 'BMBLB', 'MÉTL-X', 'MTR)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), 
Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_metaphone_all(self): + df = self.df.copy() + result = df.cols.metaphone(cols='*') + expected = self.create_dataframe(data={('NullType', 'object'): ['NN', 'NN', 'NN', 'NN', 'NN', 'NN'], ('date arrival', 'object'): ['', '', '', '', '', ''], ('height(ft)', 'object'): ['', '', '', '', 'NN', ''], ('last date seen', 'object'): ['', '', '', '', '', ''], ('last position seen', 'object'): ['', '', '', '', 'NN', 'NN'], ('rank', 'object'): ['', '', '', '', '', ''], ('Cybertronian', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FLS'], ('Date Type', 'object'): ['', '', '', '', '', ''], ('age', 'object'): ['', '', '', '', '', ''], ('function', 'object'): ['LTR', 'ESPNJ', 'SKRT', 'FRST LTNNT', 'NN', 'BTL STXN'], ('names', 'object'): ['OPTMS', 'BMBLB ', 'MTRPLKS', 'BMBLB', 'MTRP LKS', 'MTRPLKS'], ('timestamp', 'object'): ['', '', '', '', '', ''], ('weight(t)', 'object'): ['', '', '', '', '', 'NN']}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_metaphone_multiple(self): + df = self.df.copy() + result = df.cols.metaphone(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('nt', 'object'): ['NN', 'NN', 'NN', 'NN', 'NN', 'NN'], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', 
'10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('ct', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FLS'], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('ts', 'object'): ['', '', '', '', '', ''], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_metaphone_numeric(self): + df = self.df.copy() + result = df.cols.metaphone(cols=['rank'], output_cols=['rk']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'object'): ['', '', '', '', '', ''], ('Cybertronian', 'bool'): 
[True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_metaphone_string(self): + df = self.df.copy() + result = df.cols.metaphone(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 
5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['OPTMS', 'BMBLB ', 'MTRPLKS', 'BMBLB', 'MTRP LKS', 'MTRPLKS'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngram_fingerprint_all(self): + df = self.df.copy() + result = df.cols.ngram_fingerprint(cols='*') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngram_fingerprint_multiple(self): + df = self.df.copy() + result = df.cols.ngram_fingerprint(cols=['NullType', 'Cybertronian', 'timestamp'], n_size=4, output_cols=['nt', 'ct', 'ts']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngram_fingerprint_numeric(self): + df = self.df.copy() + result = df.cols.ngram_fingerprint(cols=['rank'], output_cols=['rk']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngram_fingerprint_string(self): + df = self.df.copy() + result = df.cols.ngram_fingerprint(cols=['function(binary)'], n_size=25) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngrams_multiple(self): + df = self.df.copy() + result = 
df.cols.ngrams(cols=['date arrival', 'japanese name', 'last date seen'], n_size=1, output_cols=['da', 'jn', 'lds']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngrams_single(self): + df = self.df.copy() + result = df.cols.ngrams(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): [['Op', 'pt', 'ti', 'im', 'mu', 'us'], ['bu', 'um', 'mb', 'bl', 'l#', '#e', 'eb', 'bé', 'éé', 'é ', ' '], ['Me', 'et', 'tr', 'ro', 'op', 'pl', 'le', 'ex'], ['bu', 'um', 'mb', 'bl', 'le', 'eb', 'be', 'ee'], ['mé', 'ét', 'tr', 'ro', 'op', 'p´', '´l', 'le', 'e-', '-x'], ['Me', 'et', 'tr', 'ro', 'op', 'pl', 'le', 'ex', 'x_', '_)', ')^', '^$']], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), 
Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_nysiis_all(self): + df = self.df.copy() + result = df.cols.nysiis(cols='*') + expected = self.create_dataframe(data={('NullType', 'object'): ['NAN', 'NAN', 'NAN', 'NAN', 'NAN', 'NAN'], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'object'): ['-28.0', '17.0', '26.0', '13.0', 'NAN', '30.0'], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '201/04/10'], ('last position seen', 'object'): ['19.42735,-9.201', '10.642707,-71.612534', '37.789563,-12.40356', '3.6706,-17.84153', 'NAN', 'NAN'], ('rank', 'object'): ['10', '7', '7', '8', '10', '8'], ('Cybertronian', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FALS'], ('Date Type', 'object'): ['2016-09-10', '2015-08-10', '2014-06-24', '2013-06-24', '2012-05-10', '201-04-10'], ('age', 'object'): ['50', '50', '50', '50', '50', '50'], ('function', 'object'): ['LADAR', 'ESPANAG', 'SACARATY', 'FARST', 'NAN', 'BATL'], ('names', 'object'): ['OPTAN', 'BANBL#ABÉ', 'MATRAPLAX', 'BANBLABY', 'MÉTRAP´LA-X', 'MATRAPLAX_)^$'], ('timestamp', 'object'): ['2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24'], ('weight(t)', 'object'): ['4.3', '2.0', '4.0', '1.8', '5.7', 'NAN']}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_nysiis_multiple(self): + df = self.df.copy() + result = df.cols.nysiis(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('nt', 'object'): ['NAN', 'NAN', 'NAN', 'NAN', 'NAN', 'NAN'], ('date arrival', 'object'): ['1980/04/10', 
'1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('ct', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FALS'], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('ts', 'object'): ['2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24'], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_nysiis_numeric(self): + df = self.df.copy() + result = df.cols.nysiis(cols=['rank'], output_cols=['rk']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last 
date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'object'): ['10', '7', '7', '8', '10', '8'], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_nysiis_string(self): + df = self.df.copy() + result = df.cols.nysiis(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): 
[10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['OPTAN', 'BANBL#ABÉ', 'MATRAPLAX', 'BANBLABY', 'MÉTRAP´LA-X', 'MATRAPLAX_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_pos_multiple(self): + df = self.df.copy() + result = df.cols.pos(cols=['date arrival', 'japanese name', 'last date seen'], output_cols=['da', 'jn', 'lds']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_pos_single(self): + df = self.df.copy() + result = df.cols.pos(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 
'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): [[('Optimus', 'NN')], [('bumbl#ebéé', 'NN')], [('Metroplex', 'NNP')], [('bumblebee', 'NN')], [('métrop´le-x', 'NN')], [('Metroplex_)^$', 'NN')]], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_soundex_all(self): + df = self.df.copy() + result = df.cols.soundex(cols='*') + expected = self.create_dataframe(data={('NullType', 'object'): ['N500', 'N500', 'N500', 'N500', 'N500', 'N500'], ('date arrival', 'object'): ['1000', '1000', '1000', '1000', '1000', '1000'], ('height(ft)', 'object'): ['-000', '1000', '2000', '1000', 'N500', '3000'], ('last date seen', 'object'): ['2000', '2000', '2000', '2000', '2000', '2000'], ('last position seen', 'object'): ['1000', '1000', '3000', '3000', 'N500', 'N500'], ('rank', 'object'): ['1000', '7000', '7000', '8000', '1000', '8000'], ('Cybertronian', 'object'): ['T600', 'T600', 'T600', 'T600', 'T600', 'F420'], ('Date Type', 'object'): ['2000', '2000', '2000', '2000', '2000', '2000'], ('age', 'object'): ['5000', '5000', '5000', '5000', '5000', '5000'], ('function', 'object'): ['L360', 'E215', 'S263', 'F623', 'N500', 'B342'], ('names', 'object'): 
['O135', 'B514', 'M361', 'B514', 'M361', 'M361'], ('timestamp', 'object'): ['2000', '2000', '2000', '2000', '2000', '2000'], ('weight(t)', 'object'): ['4000', '2000', '4000', '1000', '5000', 'N500']}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_soundex_multiple(self): + df = self.df.copy() + result = df.cols.soundex(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('nt', 'object'): ['N500', 'N500', 'N500', 'N500', 'N500', 'N500'], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('ct', 'object'): ['T600', 'T600', 'T600', 'T600', 'T600', 'F420'], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), 
Timestamp('2014-06-24 00:00:00')], ('ts', 'object'): ['2000', '2000', '2000', '2000', '2000', '2000'], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_soundex_numeric(self): + df = self.df.copy() + result = df.cols.soundex(cols=['rank'], output_cols=['rk']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'object'): ['1000', '7000', '7000', '8000', '1000', '8000'], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, 
decimal=True, assertion=True)) + + def test_cols_soundex_string(self): + df = self.df.copy() + result = df.cols.soundex(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['O135', 'B514', 'M361', 'B514', 'M361', 'M361'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_all_double_metaphone(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='double_metaphone') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + 
def test_string_clustering_all_fingerprint(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='fingerprint') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_all_levenshtein(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='levenshtein') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_all_match_rating_codex(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='match_rating_codex') + expected = { 'Cybertronian': { 'FLS': { 'suggestion': 'False', + 'suggestions': ['False'], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], + 'suggestions_size': 1, + 'total_count': 5}}, + 'Date Type': { '201-10': { 'suggestion': '2016-09-10', + 'suggestions': [ '2016-09-10', + '2015-08-10', + '2012-05-10', + '2011-04-10'], + 'suggestions_size': 4, + 'total_count': 4}, + '201-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24', '2013-06-24'], + 'suggestions_size': 2, + 'total_count': 2}}, + 'NullType': { 'N': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'age': { '50': { 'suggestion': '5000000', + 'suggestions': ['5000000'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { '198/10': { 'suggestion': '1980/04/10', + 'suggestions': ['1980/04/10'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'function': { 'BTLSTN': { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], + 'suggestions_size': 1, + 'total_count': 1}, + 'ESPNG': { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], + 'suggestions_size': 1, + 'total_count': 1}, + 
'FRSTNT': { 'suggestion': 'First Lieutenant', + 'suggestions': ['First Lieutenant'], + 'suggestions_size': 1, + 'total_count': 1}, + 'LDR': { 'suggestion': 'Leader', + 'suggestions': ['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 1}, + 'SCRTY': { 'suggestion': 'Security', + 'suggestions': ['Security'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'height(ft)': { '-28.0': { 'suggestion': '-28.0', + 'suggestions': ['-28.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '13.0': { 'suggestion': '13.0', + 'suggestions': ['13.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '17.0': { 'suggestion': '17.0', + 'suggestions': ['17.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '26.0': { 'suggestion': '26.0', + 'suggestions': ['26.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '30.0': { 'suggestion': '300.0', + 'suggestions': ['300.0'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last date seen': { '201/10': { 'suggestion': '2016/09/10', + 'suggestions': [ '2016/09/10', + '2015/08/10', + '2014/07/10', + '2013/06/10', + '2012/05/10', + '2011/04/10'], + 'suggestions_size': 6, + 'total_count': 6}}, + 'last position seen': { '10.534': { 'suggestion': '10.642707,-71.612534', + 'suggestions': ['10.642707,-71.612534'], + 'suggestions_size': 1, + 'total_count': 1}, + '19.201': { 'suggestion': '19.442735,-99.201111', + 'suggestions': ['19.442735,-99.201111'], + 'suggestions_size': 1, + 'total_count': 1}, + '3.6153': { 'suggestion': '33.670666,-117.841553', + 'suggestions': ['33.670666,-117.841553'], + 'suggestions_size': 1, + 'total_count': 1}, + '37.356': { 'suggestion': '37.789563,-122.400356', + 'suggestions': ['37.789563,-122.400356'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N': { 'suggestion': 'None', + 'suggestions': ['None'], + 
'suggestions_size': 1, + 'total_count': 2}}, + 'names': { 'BMB#BÉ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'BMBLB': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTR)^$': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRPLX': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MÉTL-X': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, + 'OPTMS': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { '10': { 'suggestion': '10', + 'suggestions': ['10'], + 'suggestions_size': 1, + 'total_count': 2}, + '7': { 'suggestion': '7', + 'suggestions': ['7'], + 'suggestions_size': 1, + 'total_count': 2}, + '8': { 'suggestion': '8', + 'suggestions': ['8'], + 'suggestions_size': 1, + 'total_count': 2}}, + 'timestamp': { '201-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { '1.8': { 'suggestion': '1.8', + 'suggestions': ['1.8'], + 'suggestions_size': 1, + 'total_count': 1}, + '2.0': { 'suggestion': '2.0', + 'suggestions': ['2.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '4.0': { 'suggestion': '4.0', + 'suggestions': ['4.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '4.3': { 'suggestion': '4.3', + 'suggestions': ['4.3'], + 'suggestions_size': 1, + 'total_count': 1}, + '5.7': { 'suggestion': '5.7', + 'suggestions': ['5.7'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_all_metaphone(self): + df 
= self.df.copy() + result = df.string_clustering(cols='*', algorithm='metaphone') + expected = { 'Cybertronian': { 'FLS': { 'suggestion': 'False', + 'suggestions': ['False'], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], + 'suggestions_size': 1, + 'total_count': 5}}, + 'Date Type': { '': { 'suggestion': '2016-09-10', + 'suggestions': [ '2016-09-10', + '2015-08-10', + '2014-06-24', + '2013-06-24', + '2012-05-10', + '2011-04-10'], + 'suggestions_size': 6, + 'total_count': 6}}, + 'NullType': { 'NN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'age': { '': { 'suggestion': '5000000', + 'suggestions': ['5000000'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { '': { 'suggestion': '1980/04/10', + 'suggestions': ['1980/04/10'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'function': { 'BTL STXN': { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], + 'suggestions_size': 1, + 'total_count': 1}, + 'ESPNJ': { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], + 'suggestions_size': 1, + 'total_count': 1}, + 'FRST LTNNT': { 'suggestion': 'First Lieutenant', + 'suggestions': ['First Lieutenant'], + 'suggestions_size': 1, + 'total_count': 1}, + 'LTR': { 'suggestion': 'Leader', + 'suggestions': ['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + 'NN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 1}, + 'SKRT': { 'suggestion': 'Security', + 'suggestions': ['Security'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'height(ft)': { '': { 'suggestion': '-28.0', + 'suggestions': [ '-28.0', + '17.0', + '26.0', + '13.0', + '300.0'], + 'suggestions_size': 5, + 'total_count': 5}, + 'NN': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last date seen': { '': { 'suggestion': '2016/09/10', + 'suggestions': [ '2016/09/10', 
+ '2015/08/10', + '2014/07/10', + '2013/06/10', + '2012/05/10', + '2011/04/10'], + 'suggestions_size': 6, + 'total_count': 6}}, + 'last position seen': { '': { 'suggestion': '19.442735,-99.201111', + 'suggestions': [ '19.442735,-99.201111', + '10.642707,-71.612534', + '37.789563,-122.400356', + '33.670666,-117.841553'], + 'suggestions_size': 4, + 'total_count': 4}, + 'NN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 2}}, + 'names': { 'BMBLB': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'BMBLB ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRP LKS': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRPLKS': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex', 'Metroplex_)^$'], + 'suggestions_size': 2, + 'total_count': 2}, + 'OPTMS': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { '': { 'suggestion': '10', + 'suggestions': ['10', '7', '8'], + 'suggestions_size': 3, + 'total_count': 6}}, + 'timestamp': { '': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { '': { 'suggestion': '4.3', + 'suggestions': ['4.3', '2.0', '4.0', '1.8', '5.7'], + 'suggestions_size': 5, + 'total_count': 5}, + 'NN': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_all_ngram_fingerprint(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='ngram_fingerprint') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, 
assertion=True)) + + def test_string_clustering_all_nysiis(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='nysiis') + expected = { 'Cybertronian': { 'FALS': { 'suggestion': 'False', + 'suggestions': ['False'], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], + 'suggestions_size': 1, + 'total_count': 5}}, + 'Date Type': { '201-04-10': { 'suggestion': '2011-04-10', + 'suggestions': ['2011-04-10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2012-05-10': { 'suggestion': '2012-05-10', + 'suggestions': ['2012-05-10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2013-06-24': { 'suggestion': '2013-06-24', + 'suggestions': ['2013-06-24'], + 'suggestions_size': 1, + 'total_count': 1}, + '2014-06-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 1}, + '2015-08-10': { 'suggestion': '2015-08-10', + 'suggestions': ['2015-08-10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2016-09-10': { 'suggestion': '2016-09-10', + 'suggestions': ['2016-09-10'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'NullType': { 'NAN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'age': { '50': { 'suggestion': '5000000', + 'suggestions': ['5000000'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { '1980/04/10': { 'suggestion': '1980/04/10', + 'suggestions': ['1980/04/10'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'function': { 'BATL': { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], + 'suggestions_size': 1, + 'total_count': 1}, + 'ESPANAG': { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], + 'suggestions_size': 1, + 'total_count': 1}, + 'FARST': { 'suggestion': 'First Lieutenant', + 'suggestions': ['First Lieutenant'], + 'suggestions_size': 1, + 'total_count': 1}, + 'LADAR': { 'suggestion': 'Leader', + 'suggestions': 
['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + 'NAN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 1}, + 'SACARATY': { 'suggestion': 'Security', + 'suggestions': ['Security'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'height(ft)': { '-28.0': { 'suggestion': '-28.0', + 'suggestions': ['-28.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '13.0': { 'suggestion': '13.0', + 'suggestions': ['13.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '17.0': { 'suggestion': '17.0', + 'suggestions': ['17.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '26.0': { 'suggestion': '26.0', + 'suggestions': ['26.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '30.0': { 'suggestion': '300.0', + 'suggestions': ['300.0'], + 'suggestions_size': 1, + 'total_count': 1}, + 'NAN': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last date seen': { '201/04/10': { 'suggestion': '2011/04/10', + 'suggestions': ['2011/04/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2012/05/10': { 'suggestion': '2012/05/10', + 'suggestions': ['2012/05/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2013/06/10': { 'suggestion': '2013/06/10', + 'suggestions': ['2013/06/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2014/07/10': { 'suggestion': '2014/07/10', + 'suggestions': ['2014/07/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2015/08/10': { 'suggestion': '2015/08/10', + 'suggestions': ['2015/08/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2016/09/10': { 'suggestion': '2016/09/10', + 'suggestions': ['2016/09/10'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last position seen': { '10.642707,-71.612534': { 'suggestion': '10.642707,-71.612534', + 'suggestions': [ '10.642707,-71.612534'], + 'suggestions_size': 1, + 'total_count': 1}, + '19.42735,-9.201': { 'suggestion': '19.442735,-99.201111', + 'suggestions': [ '19.442735,-99.201111'], 
+ 'suggestions_size': 1, + 'total_count': 1}, + '3.6706,-17.84153': { 'suggestion': '33.670666,-117.841553', + 'suggestions': [ '33.670666,-117.841553'], + 'suggestions_size': 1, + 'total_count': 1}, + '37.789563,-12.40356': { 'suggestion': '37.789563,-122.400356', + 'suggestions': [ '37.789563,-122.400356'], + 'suggestions_size': 1, + 'total_count': 1}, + 'NAN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 2}}, + 'names': { 'BANBL#ABÉ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'BANBLABY': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MATRAPLAX': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MATRAPLAX_)^$': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MÉTRAP´LA-X': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, + 'OPTAN': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { '10': { 'suggestion': '10', + 'suggestions': ['10'], + 'suggestions_size': 1, + 'total_count': 2}, + '7': { 'suggestion': '7', + 'suggestions': ['7'], + 'suggestions_size': 1, + 'total_count': 2}, + '8': { 'suggestion': '8', + 'suggestions': ['8'], + 'suggestions_size': 1, + 'total_count': 2}}, + 'timestamp': { '2014-06-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { '1.8': { 'suggestion': '1.8', + 'suggestions': ['1.8'], + 'suggestions_size': 1, + 'total_count': 1}, + '2.0': { 'suggestion': '2.0', + 'suggestions': ['2.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '4.0': { 'suggestion': '4.0', + 'suggestions': ['4.0'], + 'suggestions_size': 1, + 'total_count': 1}, + 
'4.3': { 'suggestion': '4.3', + 'suggestions': ['4.3'], + 'suggestions_size': 1, + 'total_count': 1}, + '5.7': { 'suggestion': '5.7', + 'suggestions': ['5.7'], + 'suggestions_size': 1, + 'total_count': 1}, + 'NAN': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_all_soundex(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='soundex') + expected = { 'Cybertronian': { 'F420': { 'suggestion': 'False', + 'suggestions': ['False'], + 'suggestions_size': 1, + 'total_count': 1}, + 'T600': { 'suggestion': 'True', + 'suggestions': ['True'], + 'suggestions_size': 1, + 'total_count': 5}}, + 'Date Type': { '2000': { 'suggestion': '2016-09-10', + 'suggestions': [ '2016-09-10', + '2015-08-10', + '2014-06-24', + '2013-06-24', + '2012-05-10', + '2011-04-10'], + 'suggestions_size': 6, + 'total_count': 6}}, + 'NullType': { 'N500': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'age': { '5000': { 'suggestion': '5000000', + 'suggestions': ['5000000'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { '1000': { 'suggestion': '1980/04/10', + 'suggestions': ['1980/04/10'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'function': { 'B342': { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], + 'suggestions_size': 1, + 'total_count': 1}, + 'E215': { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], + 'suggestions_size': 1, + 'total_count': 1}, + 'F623': { 'suggestion': 'First Lieutenant', + 'suggestions': ['First Lieutenant'], + 'suggestions_size': 1, + 'total_count': 1}, + 'L360': { 'suggestion': 'Leader', + 'suggestions': ['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N500': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 1}, + 'S263': { 
'suggestion': 'Security', + 'suggestions': ['Security'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'height(ft)': { '-000': { 'suggestion': '-28.0', + 'suggestions': ['-28.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '1000': { 'suggestion': '17.0', + 'suggestions': ['17.0', '13.0'], + 'suggestions_size': 2, + 'total_count': 2}, + '2000': { 'suggestion': '26.0', + 'suggestions': ['26.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '3000': { 'suggestion': '300.0', + 'suggestions': ['300.0'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N500': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last date seen': { '2000': { 'suggestion': '2016/09/10', + 'suggestions': [ '2016/09/10', + '2015/08/10', + '2014/07/10', + '2013/06/10', + '2012/05/10', + '2011/04/10'], + 'suggestions_size': 6, + 'total_count': 6}}, + 'last position seen': { '1000': { 'suggestion': '19.442735,-99.201111', + 'suggestions': [ '19.442735,-99.201111', + '10.642707,-71.612534'], + 'suggestions_size': 2, + 'total_count': 2}, + '3000': { 'suggestion': '37.789563,-122.400356', + 'suggestions': [ '37.789563,-122.400356', + '33.670666,-117.841553'], + 'suggestions_size': 2, + 'total_count': 2}, + 'N500': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 2}}, + 'names': { 'B514': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé ', 'bumblebee'], + 'suggestions_size': 2, + 'total_count': 2}, + 'M361': { 'suggestion': 'Metroplex', + 'suggestions': [ 'Metroplex', + 'métrop´le-x', + 'Metroplex_)^$'], + 'suggestions_size': 3, + 'total_count': 3}, + 'O135': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { '1000': { 'suggestion': '10', + 'suggestions': ['10'], + 'suggestions_size': 1, + 'total_count': 2}, + '7000': { 'suggestion': '7', + 'suggestions': ['7'], + 'suggestions_size': 1, + 'total_count': 2}, + '8000': { 
'suggestion': '8', + 'suggestions': ['8'], + 'suggestions_size': 1, + 'total_count': 2}}, + 'timestamp': { '2000': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { '1000': { 'suggestion': '1.8', + 'suggestions': ['1.8'], + 'suggestions_size': 1, + 'total_count': 1}, + '2000': { 'suggestion': '2.0', + 'suggestions': ['2.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '4000': { 'suggestion': '4.3', + 'suggestions': ['4.3', '4.0'], + 'suggestions_size': 2, + 'total_count': 2}, + '5000': { 'suggestion': '5.7', + 'suggestions': ['5.7'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N500': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_double_metaphone(self): + df = self.df.copy() + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='double_metaphone') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_multiple_fingerprint(self): + df = self.df.copy() + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='fingerprint') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_multiple_levenshtein(self): + df = self.df.copy() + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='levenshtein') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def 
test_string_clustering_multiple_match_rating_codex(self): + df = self.df.copy() + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='match_rating_codex') + expected = { 'Cybertronian': { 'FLS': { 'suggestion': 'False', + 'suggestions': ['False'], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { 'N': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { '201-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_metaphone(self): + df = self.df.copy() + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='metaphone') + expected = { 'Cybertronian': { 'FLS': { 'suggestion': 'False', + 'suggestions': ['False'], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { 'NN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { '': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_ngram_fingerprint(self): + df = self.df.copy() + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='ngram_fingerprint') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_multiple_nysiis(self): + df = self.df.copy() + result = 
df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='nysiis') + expected = { 'Cybertronian': { 'FALS': { 'suggestion': 'False', + 'suggestions': ['False'], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { 'NAN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { '2014-06-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_soundex(self): + df = self.df.copy() + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='soundex') + expected = { 'Cybertronian': { 'F420': { 'suggestion': 'False', + 'suggestions': ['False'], + 'suggestions_size': 1, + 'total_count': 1}, + 'T600': { 'suggestion': 'True', + 'suggestions': ['True'], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { 'N500': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { '2000': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_double_metaphone(self): + df = self.df.copy() + result = df.string_clustering(cols=['rank'], algorithm='double_metaphone') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_numeric_fingerprint(self): + df = self.df.copy() + result = df.string_clustering(cols=['rank'], algorithm='fingerprint') + # The following value does not represent a correct output of the operation 
+ expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_numeric_levenshtein(self): + df = self.df.copy() + result = df.string_clustering(cols=['rank'], algorithm='levenshtein') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_numeric_match_rating_codex(self): + df = self.df.copy() + result = df.string_clustering(cols=['rank'], algorithm='match_rating_codex') + expected = { 'rank': { '10': { 'suggestion': '10', + 'suggestions': ['10'], + 'suggestions_size': 1, + 'total_count': 2}, + '7': { 'suggestion': '7', + 'suggestions': ['7'], + 'suggestions_size': 1, + 'total_count': 2}, + '8': { 'suggestion': '8', + 'suggestions': ['8'], + 'suggestions_size': 1, + 'total_count': 2}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_metaphone(self): + df = self.df.copy() + result = df.string_clustering(cols=['rank'], algorithm='metaphone') + expected = { 'rank': { '': { 'suggestion': '10', + 'suggestions': ['10', '7', '8'], + 'suggestions_size': 3, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_ngram_fingerprint(self): + df = self.df.copy() + result = df.string_clustering(cols=['rank'], algorithm='ngram_fingerprint') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_numeric_nysiis(self): + df = self.df.copy() + result = df.string_clustering(cols=['rank'], algorithm='nysiis') + expected = { 'rank': { '10': { 'suggestion': '10', + 'suggestions': ['10'], + 'suggestions_size': 1, + 'total_count': 2}, + '7': { 'suggestion': '7', + 'suggestions': ['7'], 
+ 'suggestions_size': 1, + 'total_count': 2}, + '8': { 'suggestion': '8', + 'suggestions': ['8'], + 'suggestions_size': 1, + 'total_count': 2}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_soundex(self): + df = self.df.copy() + result = df.string_clustering(cols=['rank'], algorithm='soundex') + expected = { 'rank': { '1000': { 'suggestion': '10', + 'suggestions': ['10'], + 'suggestions_size': 1, + 'total_count': 2}, + '7000': { 'suggestion': '7', + 'suggestions': ['7'], + 'suggestions_size': 1, + 'total_count': 2}, + '8000': { 'suggestion': '8', + 'suggestions': ['8'], + 'suggestions_size': 1, + 'total_count': 2}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_string_double_metaphone(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='double_metaphone') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_string_fingerprint(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='fingerprint') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_string_levenshtein(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='levenshtein') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_string_match_rating_codex(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='match_rating_codex') + expected = { 'names': { 'BMB#BÉ': { 'suggestion': 'bumbl#ebéé ', + 
'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'BMBLB': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTR)^$': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRPLX': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MÉTL-X': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, + 'OPTMS': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_string_metaphone(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='metaphone') + expected = { 'names': { 'BMBLB': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'BMBLB ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRP LKS': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRPLKS': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex', 'Metroplex_)^$'], + 'suggestions_size': 2, + 'total_count': 2}, + 'OPTMS': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_string_ngram_fingerprint(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='ngram_fingerprint') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def 
test_string_clustering_string_nysiis(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='nysiis') + expected = { 'names': { 'BANBL#ABÉ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'BANBLABY': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MATRAPLAX': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MATRAPLAX_)^$': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MÉTRAP´LA-X': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, + 'OPTAN': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_string_soundex(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='soundex') + expected = { 'names': { 'B514': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé ', 'bumblebee'], + 'suggestions_size': 2, + 'total_count': 2}, + 'M361': { 'suggestion': 'Metroplex', + 'suggestions': [ 'Metroplex', + 'métrop´le-x', + 'Metroplex_)^$'], + 'suggestions_size': 3, + 'total_count': 3}, + 'O135': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + +class TestStringclusteringDask(TestStringclusteringPandas): + config = {'engine': 'dask', 'n_partitions': 1} + + +class TestStringclusteringPartitionDask(TestStringclusteringPandas): + config = {'engine': 'dask', 'n_partitions': 2} + + +try: + import cudf # pyright: reportMissingImports=false +except: + pass +else: + class 
TestStringclusteringCUDF(TestStringclusteringPandas): + config = {'engine': 'cudf'} + + +try: + import dask_cudf # pyright: reportMissingImports=false +except: + pass +else: + class TestStringclusteringDC(TestStringclusteringPandas): + config = {'engine': 'dask_cudf', 'n_partitions': 1} + + +try: + import dask_cudf # pyright: reportMissingImports=false +except: + pass +else: + class TestStringclusteringPartitionDC(TestStringclusteringPandas): + config = {'engine': 'dask_cudf', 'n_partitions': 2} + + +try: + import pyspark # pyright: reportMissingImports=false +except: + pass +else: + class TestStringclusteringSpark(TestStringclusteringPandas): + config = {'engine': 'spark'} + + +try: + import vaex # pyright: reportMissingImports=false +except: + pass +else: + class TestStringclusteringVaex(TestStringclusteringPandas): + config = {'engine': 'vaex'}