From 7122da29e556f9cd6d6c9ea34e93610c9f459f19 Mon Sep 17 00:00:00 2001 From: Jose Angel Hernao Date: Fri, 17 Sep 2021 13:59:15 -0500 Subject: [PATCH 1/2] Add string clustering tests --- tests/creators/creator_stringclustering.py | 67 + tests/test_created__stringclustering.py | 1395 ++++++++++++++++++++ 2 files changed, 1462 insertions(+) create mode 100644 tests/creators/creator_stringclustering.py create mode 100644 tests/test_created__stringclustering.py diff --git a/tests/creators/creator_stringclustering.py b/tests/creators/creator_stringclustering.py new file mode 100644 index 000000000..8a63d1f66 --- /dev/null +++ b/tests/creators/creator_stringclustering.py @@ -0,0 +1,67 @@ +import datetime +import sys +sys.path.append("../..") + + +def create(): + from optimus import Optimus + from optimus.tests.creator import TestCreator, default_configs + + op = Optimus("pandas") + df = op.create.dataframe({ + 'NullType': [None, None, None, None, None, None], + 'date arrival': ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], + 'height(ft)': [-28, 17, 26, 13, None, 300], + ('last date seen', 'date'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], + 'last position seen': ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], + 'rank': [10, 7, 7, 8, 10, 8], + ('Cybertronian', 'bool'): [True, True, True, True, True, False], + ('Date Type'): [datetime.datetime(2016, 9, 10), datetime.datetime(2015, 8, 10), datetime.datetime(2014, 6, 24), datetime.datetime(2013, 6, 24), datetime.datetime(2012, 5, 10), datetime.datetime(2011, 4, 10)], + ('age', 'int'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], + ('function', 'string'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], + ('names', 'str'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], + ('timestamp', 'time'): [datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0)], + ('weight(t)', 'float'): [4.3, 2.0, 4.0, 1.8, 5.7, None] + }) + + t = TestCreator(op, df, name="stringclustering", configs=default_configs) + + t.create(method="string_clustering", variant="all_fingerprint", cols="*", algorithm="fingerprint") + t.create(method="string_clustering", variant="all_ngram_fingerprint", cols="*", algorithm="ngram_fingerprint") + t.create(method="string_clustering", variant="all_metaphone", cols="*", algorithm="metaphone") + t.create(method="string_clustering", variant="all_nysiis", cols="*", algorithm="nysiis") + t.create(method="string_clustering", variant="all_match_rating_codex", cols="*", algorithm="match_rating_codex") + t.create(method="string_clustering", variant="all_double_metaphone", cols="*", algorithm="double_metaphone") + t.create(method="string_clustering", variant="all_soundex", cols="*", algorithm="soundex") + t.create(method="string_clustering", variant="all_levenshtein", cols="*", algorithm="levenshtein") + + t.create(method="string_clustering", variant="numeric_fingerprint", cols=["rank"], algorithm="fingerprint") + t.create(method="string_clustering", variant="numeric_ngram_fingerprint", cols=["rank"], algorithm="ngram_fingerprint") + t.create(method="string_clustering", variant="numeric_metaphone", cols=["rank"], algorithm="metaphone") + t.create(method="string_clustering", variant="numeric_nysiis", cols=["rank"], algorithm="nysiis") + t.create(method="string_clustering", variant="numeric_match_rating_codex", cols=["rank"], algorithm="match_rating_codex") + t.create(method="string_clustering", variant="numeric_double_metaphone", cols=["rank"], algorithm="double_metaphone") + t.create(method="string_clustering", variant="numeric_soundex", cols=["rank"], algorithm="soundex") + t.create(method="string_clustering", variant="numeric_levenshtein", cols=["rank"], algorithm="levenshtein") + + t.create(method="string_clustering", variant="string_fingerprint", cols=["names"], algorithm="fingerprint") + t.create(method="string_clustering", variant="string_ngram_fingerprint", cols=["names"], algorithm="ngram_fingerprint") + t.create(method="string_clustering", variant="string_metaphone", cols=["names"], algorithm="metaphone") + t.create(method="string_clustering", variant="string_nysiis", cols=["names"], algorithm="nysiis") + t.create(method="string_clustering", variant="string_match_rating_codex", cols=["names"], algorithm="match_rating_codex") + t.create(method="string_clustering", variant="string_double_metaphone", cols=["names"], algorithm="double_metaphone") + t.create(method="string_clustering", variant="string_soundex", cols=["names"], algorithm="soundex") + t.create(method="string_clustering", variant="string_levenshtein", cols=["names"], algorithm="levenshtein") + + t.create(method="string_clustering", variant="multiple_fingerprint", cols=["NullType","Cybertronian","timestamp"], algorithm="fingerprint") + t.create(method="string_clustering", variant="multiple_ngram_fingerprint", cols=["NullType","Cybertronian","timestamp"], algorithm="ngram_fingerprint") + t.create(method="string_clustering", variant="multiple_metaphone", cols=["NullType","Cybertronian","timestamp"], algorithm="metaphone") + t.create(method="string_clustering", variant="multiple_nysiis", cols=["NullType","Cybertronian","timestamp"], algorithm="nysiis") + t.create(method="string_clustering", variant="multiple_match_rating_codex", cols=["NullType","Cybertronian","timestamp"], algorithm="match_rating_codex") + t.create(method="string_clustering", variant="multiple_double_metaphone", cols=["NullType","Cybertronian","timestamp"], algorithm="double_metaphone") + t.create(method="string_clustering", variant="multiple_soundex", cols=["NullType","Cybertronian","timestamp"], algorithm="soundex") + t.create(method="string_clustering", variant="multiple_levenshtein", cols=["NullType","Cybertronian","timestamp"], algorithm="levenshtein") + + t.run() + +create() \ No newline at end of file diff --git a/tests/test_created__stringclustering.py b/tests/test_created__stringclustering.py new file mode 100644 index 000000000..4542e89a6 --- /dev/null +++ b/tests/test_created__stringclustering.py @@ -0,0 +1,1395 @@ +import datetime +from optimus.tests.base import TestBase +from optimus.helpers.json import json_encoding +from optimus.helpers.functions import deep_sort, df_dicts_equal, results_equal + + +def Timestamp(t): + return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S") + + +nan = float("nan") +inf = float("inf") + + +class TestStringclusteringPandas(TestBase): + config = {'engine': 'pandas'} + dict = {('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]} + maxDiff = None + + def test_string_clustering_all_double_metaphone(self): + df = self.df + result = df.string_clustering(cols='*', algorithm='double_metaphone') + expected = { 'Cybertronian': { ('FLS', ''): { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + ('TR', ''): { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'Date Type': { ('', ''): { 'suggestion': Timestamp('2016-09-10 00:00:00'), + 'suggestions': [ Timestamp('2016-09-10 00:00:00'), + Timestamp('2015-08-10 00:00:00'), + Timestamp('2014-06-24 00:00:00'), + Timestamp('2013-06-24 00:00:00'), + Timestamp('2012-05-10 00:00:00'), + Timestamp('2011-04-10 00:00:00')], + 'suggestions_size': 6, + 'total_count': 6}}, + 'NullType': { ('NN', ''): { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'age': { ('', ''): { 'suggestion': 5000000, + 'suggestions': [5000000], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { ('', ''): { 'suggestion': '1980/04/10', + 'suggestions': ['1980/04/10'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'function': { ('ASPNJ', 'ASPNK'): { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], + 'suggestions_size': 1, + 'total_count': 1}, + ('FRSTLTNNT', ''): { 'suggestion': 'First Lieutenant', + 'suggestions': ['First Lieutenant'], + 'suggestions_size': 1, + 'total_count': 1}, + ('LTR', ''): { 'suggestion': 'Leader', + 'suggestions': ['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + ('NN', ''): { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 1}, + ('PTLSTXN', ''): { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], + 'suggestions_size': 1, + 'total_count': 1}, + ('SKRT', ''): { 'suggestion': 'Security', + 'suggestions': ['Security'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'height(ft)': { ('', ''): { 'suggestion': -28.0, + 'suggestions': [-28.0, 17.0, 26.0, 13.0, 300.0], + 'suggestions_size': 5, + 'total_count': 5}, + ('NN', ''): { 'suggestion': nan, + 'suggestions': [nan], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last date seen': { ('', ''): { 'suggestion': '2016/09/10', + 'suggestions': [ '2016/09/10', + '2015/08/10', + '2014/07/10', + '2013/06/10', + '2012/05/10', + '2011/04/10'], + 'suggestions_size': 6, + 'total_count': 6}}, + 'last position seen': { ('', ''): { 'suggestion': '19.442735,-99.201111', + 'suggestions': [ '19.442735,-99.201111', + '10.642707,-71.612534', + '37.789563,-122.400356', + '33.670666,-117.841553'], + 'suggestions_size': 4, + 'total_count': 4}, + ('NN', ''): { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 2}}, + 'names': { ('APTMS', ''): { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}, + ('ARNT', ''): { 'suggestion': 'ironhide&', + 'suggestions': ['ironhide&'], + 'suggestions_size': 1, + 'total_count': 1}, + ('JS', 'AS'): { 'suggestion': 'Jazz', + 'suggestions': ['Jazz'], + 'suggestions_size': 1, + 'total_count': 1}, + ('MKTRN', ''): { 'suggestion': 'Megatron', + 'suggestions': ['Megatron'], + 'suggestions_size': 1, + 'total_count': 1}, + ('MTRPLKSKSKSKSKS', ''): { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + ('PMPLLP', ''): { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { ('', ''): { 'suggestion': 10, + 'suggestions': [10, 7, 8], + 'suggestions_size': 3, + 'total_count': 6}}, + 'timestamp': { ('', ''): { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { ('', ''): { 'suggestion': 4.3, + 'suggestions': [4.3, 2.0, 4.0, 1.8, 5.7], + 'suggestions_size': 5, + 'total_count': 5}, + ('NN', ''): { 'suggestion': nan, + 'suggestions': [nan], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_all_fingerprint(self): + df = self.df + result = df.string_clustering(cols='*', algorithm='fingerprint') + expected = { 'Cybertronian': { 'false': { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + 'true': { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'Date Type': { '20110410': { 'suggestion': Timestamp('2011-04-10 00:00:00'), + 'suggestions': [ Timestamp('2011-04-10 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}, + '20120510': { 'suggestion': Timestamp('2012-05-10 00:00:00'), + 'suggestions': [ Timestamp('2012-05-10 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}, + '20130624': { 'suggestion': Timestamp('2013-06-24 00:00:00'), + 'suggestions': [ Timestamp('2013-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}, + '20140624': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [ Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}, + '20150810': { 'suggestion': Timestamp('2015-08-10 00:00:00'), + 'suggestions': [ Timestamp('2015-08-10 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}, + '20160910': { 'suggestion': Timestamp('2016-09-10 00:00:00'), + 'suggestions': [ Timestamp('2016-09-10 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}}, + 'NullType': { 'none': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'age': { '5000000': { 'suggestion': 5000000, + 'suggestions': [5000000], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { '19800410': { 'suggestion': '1980/04/10', + 'suggestions': ['1980/04/10'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'function': { 'battle station': { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], + 'suggestions_size': 1, + 'total_count': 1}, + 'espionage': { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], + 'suggestions_size': 1, + 'total_count': 1}, + 'first lieutenant': { 'suggestion': 'First Lieutenant', + 'suggestions': ['First Lieutenant'], + 'suggestions_size': 1, + 'total_count': 1}, + 'leader': { 'suggestion': 'Leader', + 'suggestions': ['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + 'none': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 1}, + 'security': { 'suggestion': 'Security', + 'suggestions': ['Security'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'height(ft)': { '130': { 'suggestion': 13.0, + 'suggestions': [13.0], + 'suggestions_size': 1, + 'total_count': 1}, + '170': { 'suggestion': 17.0, + 'suggestions': [17.0], + 'suggestions_size': 1, + 'total_count': 1}, + '260': { 'suggestion': 26.0, + 'suggestions': [26.0], + 'suggestions_size': 1, + 'total_count': 1}, + '280': { 'suggestion': -28.0, + 'suggestions': [-28.0], + 'suggestions_size': 1, + 'total_count': 1}, + '3000': { 'suggestion': 300.0, + 'suggestions': [300.0], + 'suggestions_size': 1, + 'total_count': 1}, + 'nan': { 'suggestion': nan, + 'suggestions': [nan], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last date seen': { '20110410': { 'suggestion': '2011/04/10', + 'suggestions': ['2011/04/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '20120510': { 'suggestion': '2012/05/10', + 'suggestions': ['2012/05/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '20130610': { 'suggestion': '2013/06/10', + 'suggestions': ['2013/06/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '20140710': { 'suggestion': '2014/07/10', + 'suggestions': ['2014/07/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '20150810': { 'suggestion': '2015/08/10', + 'suggestions': ['2015/08/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '20160910': { 'suggestion': '2016/09/10', + 'suggestions': ['2016/09/10'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last position seen': { '1064270771612534': { 'suggestion': '10.642707,-71.612534', + 'suggestions': [ '10.642707,-71.612534'], + 'suggestions_size': 1, + 'total_count': 1}, + '1944273599201111': { 'suggestion': '19.442735,-99.201111', + 'suggestions': [ '19.442735,-99.201111'], + 'suggestions_size': 1, + 'total_count': 1}, + '33670666117841553': { 'suggestion': '33.670666,-117.841553', + 'suggestions': [ '33.670666,-117.841553'], + 'suggestions_size': 1, + 'total_count': 1}, + '37789563122400356': { 'suggestion': '37.789563,-122.400356', + 'suggestions': [ '37.789563,-122.400356'], + 'suggestions_size': 1, + 'total_count': 1}, + 'none': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 2}}, + 'names': { 'bumblebee': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'ironhide': { 'suggestion': 'ironhide&', + 'suggestions': ['ironhide&'], + 'suggestions_size': 1, + 'total_count': 1}, + 'jazz': { 'suggestion': 'Jazz', + 'suggestions': ['Jazz'], + 'suggestions_size': 1, + 'total_count': 1}, + 'megatron': { 'suggestion': 'Megatron', + 'suggestions': ['Megatron'], + 'suggestions_size': 1, + 'total_count': 1}, + 'metroplex': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'optimus': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { '10': { 'suggestion': 10, + 'suggestions': [10], + 'suggestions_size': 1, + 'total_count': 2}, + '7': { 'suggestion': 7, + 'suggestions': [7], + 'suggestions_size': 1, + 'total_count': 2}, + '8': { 'suggestion': 8, + 'suggestions': [8], + 'suggestions_size': 1, + 'total_count': 2}}, + 'timestamp': { '20140624': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [ Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { '18': { 'suggestion': 1.8, + 'suggestions': [1.8], + 'suggestions_size': 1, + 'total_count': 1}, + '20': { 'suggestion': 2.0, + 'suggestions': [2.0], + 'suggestions_size': 1, + 'total_count': 1}, + '40': { 'suggestion': 4.0, + 'suggestions': [4.0], + 'suggestions_size': 1, + 'total_count': 1}, + '43': { 'suggestion': 4.3, + 'suggestions': [4.3], + 'suggestions_size': 1, + 'total_count': 1}, + '57': { 'suggestion': 5.7, + 'suggestions': [5.7], + 'suggestions_size': 1, + 'total_count': 1}, + 'nan': { 'suggestion': nan, + 'suggestions': [nan], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_all_levenshtein(self): + df = self.df + result = df.string_clustering(cols='*', algorithm='levenshtein') + expected = { 'weight(t)': { '18': { 'suggestion': '18', + 'suggestions': ['18', '43'], + 'suggestions_size': 2, + 'total_count': 6}, + '20': { 'suggestion': '20', + 'suggestions': ['20', '40'], + 'suggestions_size': 2, + 'total_count': 6}, + '40': { 'suggestion': '40', + 'suggestions': ['40', '43'], + 'suggestions_size': 2, + 'total_count': 6}, + '43': { 'suggestion': '43', + 'suggestions': ['43', '40'], + 'suggestions_size': 2, + 'total_count': 6}, + '57': { 'suggestion': '57', + 'suggestions': ['57', '43'], + 'suggestions_size': 2, + 'total_count': 6}, + 'nan': { 'suggestion': 'nan', + 'suggestions': ['nan', '43'], + 'suggestions_size': 2, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_all_match_rating_codex(self): + df = self.df + result = df.string_clustering(cols='*', algorithm='match_rating_codex') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_all_metaphone(self): + df = self.df + result = df.string_clustering(cols='*', algorithm='metaphone') + expected = { 'Cybertronian': { 'FLS': { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'Date Type': { '': { 'suggestion': Timestamp('2016-09-10 00:00:00'), + 'suggestions': [ Timestamp('2016-09-10 00:00:00'), + Timestamp('2015-08-10 00:00:00'), + Timestamp('2014-06-24 00:00:00'), + Timestamp('2013-06-24 00:00:00'), + Timestamp('2012-05-10 00:00:00'), + Timestamp('2011-04-10 00:00:00')], + 'suggestions_size': 6, + 'total_count': 6}}, + 'NullType': { 'NN': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'age': { '': { 'suggestion': 5000000, + 'suggestions': [5000000], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { '': { 'suggestion': '1980/04/10', + 'suggestions': ['1980/04/10'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'function': { 'BTL STXN': { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], + 'suggestions_size': 1, + 'total_count': 1}, + 'ESPNJ': { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], + 'suggestions_size': 1, + 'total_count': 1}, + 'FRST LTNNT': { 'suggestion': 'First Lieutenant', + 'suggestions': ['First Lieutenant'], + 'suggestions_size': 1, + 'total_count': 1}, + 'LTR': { 'suggestion': 'Leader', + 'suggestions': ['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + 'NN': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 1}, + 'SKRT': { 'suggestion': 'Security', + 'suggestions': ['Security'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'height(ft)': { '': { 'suggestion': -28.0, + 'suggestions': [-28.0, 17.0, 26.0, 13.0, 300.0], + 'suggestions_size': 5, + 'total_count': 5}, + 'NN': { 'suggestion': nan, + 'suggestions': [nan], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last date seen': { '': { 'suggestion': '2016/09/10', + 'suggestions': [ '2016/09/10', + '2015/08/10', + '2014/07/10', + '2013/06/10', + '2012/05/10', + '2011/04/10'], + 'suggestions_size': 6, + 'total_count': 6}}, + 'last position seen': { '': { 'suggestion': '19.442735,-99.201111', + 'suggestions': [ '19.442735,-99.201111', + '10.642707,-71.612534', + '37.789563,-122.400356', + '33.670666,-117.841553'], + 'suggestions_size': 4, + 'total_count': 4}, + 'NN': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 2}}, + 'names': { 'BMBLB ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'IRNHT': { 'suggestion': 'ironhide&', + 'suggestions': ['ironhide&'], + 'suggestions_size': 1, + 'total_count': 1}, + 'JS': { 'suggestion': 'Jazz', + 'suggestions': ['Jazz'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MKTRN': { 'suggestion': 'Megatron', + 'suggestions': ['Megatron'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRPLKS': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'OPTMS': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { '': { 'suggestion': 10, + 'suggestions': [10, 7, 8], + 'suggestions_size': 3, + 'total_count': 6}}, + 'timestamp': { '': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { '': { 'suggestion': 4.3, + 'suggestions': [4.3, 2.0, 4.0, 1.8, 5.7], + 'suggestions_size': 5, + 'total_count': 5}, + 'NN': { 'suggestion': nan, + 'suggestions': [nan], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_all_ngram_fingerprint(self): + df = self.df + result = df.string_clustering(cols='*', algorithm='ngram_fingerprint') + expected = { 'Cybertronian': { 'alfalsse': { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + 'rutrue': { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'Date Type': { '010410112041': { 'suggestion': Timestamp('2011-04-10 00:00:00'), + 'suggestions': [ Timestamp('2011-04-10 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}, + '010510122051': { 'suggestion': Timestamp('2012-05-10 00:00:00'), + 'suggestions': [ Timestamp('2012-05-10 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}, + '01061320243062': { 'suggestion': Timestamp('2013-06-24 00:00:00'), + 'suggestions': [ Timestamp('2013-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}, + '01061420244062': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [ Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}, + '01081015205081': { 'suggestion': Timestamp('2015-08-10 00:00:00'), + 'suggestions': [ Timestamp('2015-08-10 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}, + '01091016206091': { 'suggestion': Timestamp('2016-09-10 00:00:00'), + 'suggestions': [ Timestamp('2016-09-10 00:00:00')], + 'suggestions_size': 1, + 'total_count': 1}}, + 'NullType': { 'nenoon': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'age': { '0050': { 'suggestion': 5000000, + 'suggestions': [5000000], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { '00041019418098': { 'suggestion': '1980/04/10', + 'suggestions': ['1980/04/10'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'function': { 'addeeaerle': { 'suggestion': 'Leader', + 'suggestions': ['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + 'agesgeionaonpisp': { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], + 'suggestions_size': 1, + 'total_count': 1}, + 'aneneufiieirlinantrssttetlut': { 'suggestion': 'First ' + 'Lieutenant', + 'suggestions': [ 'First ' + 'Lieutenant'], + 'suggestions_size': 1, + 'total_count': 1}, + 'atbaesioleonsttatitltt': { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], + 'suggestions_size': 1, + 'total_count': 1}, + 'cuecitrisetyur': { 'suggestion': 'Security', + 'suggestions': ['Security'], + 'suggestions_size': 1, + 'total_count': 1}, + 'nenoon': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 1}}, + 'height(ft)': { '0030': { 'suggestion': 300.0, + 'suggestions': [300.0], + 'suggestions_size': 1, + 'total_count': 1}, + '1330': { 'suggestion': 13.0, + 'suggestions': [13.0], + 'suggestions_size': 1, + 'total_count': 1}, + '1770': { 'suggestion': 17.0, + 'suggestions': [17.0], + 'suggestions_size': 1, + 'total_count': 1}, + '2660': { 'suggestion': 26.0, + 'suggestions': [26.0], + 'suggestions_size': 1, + 'total_count': 1}, + '2880': { 'suggestion': -28.0, + 'suggestions': [-28.0], + 'suggestions_size': 1, + 'total_count': 1}, + 'anna': { 'suggestion': nan, + 'suggestions': [nan], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last date seen': { '010410112041': { 'suggestion': '2011/04/10', + 'suggestions': ['2011/04/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '010510122051': { 'suggestion': '2012/05/10', + 'suggestions': ['2012/05/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '01061013203061': { 'suggestion': '2013/06/10', + 'suggestions': ['2013/06/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '01071014204071': { 'suggestion': '2014/07/10', + 'suggestions': ['2014/07/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '01081015205081': { 'suggestion': '2015/08/10', + 'suggestions': ['2015/08/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '01091016206091': { 'suggestion': '2016/09/10', + 'suggestions': ['2016/09/10'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last position seen': { '000312222431353740566377788995': { 'suggestion': '37.789563,-122.400356', + 'suggestions': [ '37.789563,-122.400356'], + 'suggestions_size': 1, + 'total_count': 1}, + '01111920273542445973929499': { 'suggestion': '19.442735,-99.201111', + 'suggestions': [ '19.442735,-99.201111'], + 'suggestions_size': 1, + 'total_count': 1}, + '060710121625273442536164707177': { 'suggestion': '10.642707,-71.612534', + 'suggestions': [ '10.642707,-71.612534'], + 'suggestions_size': 1, + 'total_count': 1}, + '061115173336415355616667707884': { 'suggestion': '33.670666,-117.841553', + 'suggestions': [ '33.670666,-117.841553'], + 'suggestions_size': 1, + 'total_count': 1}, + 'nenoon': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 2}}, + 'names': { 'ateggameonrotr': { 'suggestion': 'Megatron', + 'suggestions': ['Megatron'], + 'suggestions_size': 1, + 'total_count': 1}, + 'azjazz': { 'suggestion': 'Jazz', + 'suggestions': ['Jazz'], + 'suggestions_size': 1, + 'total_count': 1}, + 'beblbuebeelembum': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'dehiidirnhonro': { 'suggestion': 'ironhide&', + 'suggestions': ['ironhide&'], + 'suggestions_size': 1, + 'total_count': 1}, + 'etexlemeopplrotr': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'immuoppttius': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { '': { 'suggestion': 7, + 'suggestions': [7, 8], + 'suggestions_size': 2, + 'total_count': 4}, + '10': { 'suggestion': 10, + 'suggestions': [10], + 'suggestions_size': 1, + 'total_count': 2}}, + 'timestamp': { '01061420244062': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [ Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { '18': { 'suggestion': 1.8, + 'suggestions': [1.8], + 'suggestions_size': 1, + 'total_count': 1}, + '20': { 'suggestion': 2.0, + 'suggestions': [2.0], + 'suggestions_size': 1, + 'total_count': 1}, + '40': { 'suggestion': 4.0, + 'suggestions': [4.0], + 'suggestions_size': 1, + 'total_count': 1}, + '43': { 'suggestion': 4.3, + 'suggestions': [4.3], + 'suggestions_size': 1, + 'total_count': 1}, + '57': { 'suggestion': 5.7, + 'suggestions': [5.7], + 'suggestions_size': 1, + 'total_count': 1}, + 'anna': { 'suggestion': nan, + 'suggestions': [nan], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_all_nysiis(self): + df = self.df + result = df.string_clustering(cols='*', algorithm='nysiis') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_all_soundex(self): + df = self.df + result = df.string_clustering(cols='*', algorithm='soundex') + expected = { 'Cybertronian': { 'F420': { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + 'T600': { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'Date Type': { '2000': { 'suggestion': Timestamp('2016-09-10 00:00:00'), + 'suggestions': [ Timestamp('2016-09-10 00:00:00'), + Timestamp('2015-08-10 00:00:00'), + Timestamp('2014-06-24 00:00:00'), + Timestamp('2013-06-24 00:00:00'), + Timestamp('2012-05-10 00:00:00'), + Timestamp('2011-04-10 00:00:00')], + 'suggestions_size': 6, + 'total_count': 6}}, + 'NullType': { 'N500': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'age': { '5000': { 'suggestion': 5000000, + 'suggestions': [5000000], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { '1000': { 'suggestion': '1980/04/10', + 'suggestions': ['1980/04/10'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'function': { 'B342': { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], + 'suggestions_size': 1, + 'total_count': 1}, + 'E215': { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], + 'suggestions_size': 1, + 'total_count': 1}, + 'F623': { 'suggestion': 'First Lieutenant', + 'suggestions': ['First Lieutenant'], + 'suggestions_size': 1, + 'total_count': 1}, + 'L360': { 'suggestion': 'Leader', + 'suggestions': ['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N500': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 1}, + 'S263': { 'suggestion': 'Security', + 'suggestions': ['Security'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'height(ft)': { '-000': { 'suggestion': -28.0, + 'suggestions': [-28.0], + 'suggestions_size': 1, + 'total_count': 1}, + '1000': { 'suggestion': 17.0, + 'suggestions': [17.0, 13.0], + 'suggestions_size': 2, + 'total_count': 2}, + '2000': { 'suggestion': 26.0, + 'suggestions': [26.0], + 'suggestions_size': 1, + 'total_count': 1}, + '3000': { 'suggestion': 300.0, + 'suggestions': [300.0], + 'suggestions_size': 1, + 'total_count': 1}, + 'N500': { 'suggestion': nan, + 'suggestions': [nan], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last date seen': { '2000': { 'suggestion': '2016/09/10', + 'suggestions': [ '2016/09/10', + '2015/08/10', + '2014/07/10', + '2013/06/10', + '2012/05/10', + '2011/04/10'], + 'suggestions_size': 6, + 'total_count': 6}}, + 'last position seen': { '1000': { 'suggestion': '19.442735,-99.201111', + 'suggestions': [ '19.442735,-99.201111', + '10.642707,-71.612534'], + 'suggestions_size': 2, + 'total_count': 2}, + '3000': { 'suggestion': '37.789563,-122.400356', + 'suggestions': [ '37.789563,-122.400356', + '33.670666,-117.841553'], + 'suggestions_size': 2, + 'total_count': 2}, + 'N500': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 2}}, + 'names': { 'B514': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'I653': { 'suggestion': 'ironhide&', + 'suggestions': ['ironhide&'], + 'suggestions_size': 1, + 'total_count': 1}, + 'J200': { 'suggestion': 'Jazz', + 'suggestions': ['Jazz'], + 'suggestions_size': 1, + 'total_count': 1}, + 'M236': { 'suggestion': 'Megatron', + 'suggestions': ['Megatron'], + 'suggestions_size': 1, + 'total_count': 1}, + 'M361': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'O135': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { '1000': { 'suggestion': 10, + 'suggestions': [10], + 'suggestions_size': 1, + 'total_count': 2}, + '7000': { 'suggestion': 7, + 'suggestions': [7], + 'suggestions_size': 1, + 'total_count': 2}, + '8000': { 'suggestion': 8, + 'suggestions': [8], + 'suggestions_size': 1, + 'total_count': 2}}, + 'timestamp': { '2000': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { '1000': { 'suggestion': 1.8, + 'suggestions': [1.8], + 'suggestions_size': 1, + 'total_count': 1}, + '2000': { 'suggestion': 2.0, + 'suggestions': [2.0], + 'suggestions_size': 1, + 'total_count': 1}, + '4000': { 'suggestion': 4.3, + 'suggestions': [4.3, 4.0], + 'suggestions_size': 2, + 'total_count': 2}, + '5000': { 'suggestion': 5.7, + 'suggestions': [5.7], + 'suggestions_size': 1, + 'total_count': 1}, + 'N500': { 'suggestion': nan, + 'suggestions': [nan], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_double_metaphone(self): + df = self.df + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='double_metaphone') + expected = { 'Cybertronian': { ('FLS', ''): { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + ('TR', ''): { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { ('NN', ''): { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { ('', ''): { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_fingerprint(self): + df = self.df + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='fingerprint') + expected = { 'Cybertronian': { 'false': { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + 'true': { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { 'none': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { '20140624': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [ Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_levenshtein(self): + df = self.df + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='levenshtein') + expected = { 'timestamp': { '20140624': { 'suggestion': '20140624', + 'suggestions': ['20140624'], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_match_rating_codex(self): + df = self.df + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='match_rating_codex') + expected = { 'Cybertronian': { 'FLS': { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { 'N': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { '201-24': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_metaphone(self): + df = self.df + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='metaphone') + expected = { 'Cybertronian': { 'FLS': { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { 'NN': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { '': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_ngram_fingerprint(self): + df = self.df + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='ngram_fingerprint') + expected = { 'Cybertronian': { 'alfalsse': { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + 'rutrue': { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { 'nenoon': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { '01061420244062': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [ Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_nysiis(self): + df = self.df + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='nysiis') + expected = { 'Cybertronian': { 'FALS': { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { 'NAN': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { '2014-06-24': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [ Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_multiple_soundex(self): + df = self.df + result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='soundex') + expected = { 'Cybertronian': { 'F420': { 'suggestion': False, + 'suggestions': [False], + 'suggestions_size': 1, + 'total_count': 1}, + 'T600': { 'suggestion': True, + 'suggestions': [True], + 'suggestions_size': 1, + 'total_count': 5}}, + 'NullType': { 'N500': { 'suggestion': None, + 'suggestions': [None], + 'suggestions_size': 1, + 'total_count': 6}}, + 'timestamp': { '2000': { 'suggestion': Timestamp('2014-06-24 00:00:00'), + 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_double_metaphone(self): + df = self.df + result = df.string_clustering(cols=['rank'], algorithm='double_metaphone') + expected = { 'rank': { ('', ''): { 'suggestion': 10, + 'suggestions': [10, 7, 8], + 'suggestions_size': 3, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_fingerprint(self): + df = self.df + result = df.string_clustering(cols=['rank'], algorithm='fingerprint') + expected = { 'rank': { '10': { 'suggestion': 10, + 'suggestions': [10], + 'suggestions_size': 1, + 'total_count': 2}, + '7': { 'suggestion': 7, + 'suggestions': [7], + 'suggestions_size': 1, + 'total_count': 2}, + '8': { 'suggestion': 8, + 'suggestions': [8], + 'suggestions_size': 1, + 'total_count': 2}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_levenshtein(self): + df = self.df + result = df.string_clustering(cols=['rank'], algorithm='levenshtein') + expected = { 'rank': { '10': { 'suggestion': '10', + 'suggestions': ['10'], + 'suggestions_size': 1, + 'total_count': 6}, + '7': { 'suggestion': '7', + 'suggestions': ['7'], + 'suggestions_size': 1, + 'total_count': 6}, + '8': { 'suggestion': '8', + 'suggestions': ['8'], + 'suggestions_size': 1, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_match_rating_codex(self): + df = self.df + result = df.string_clustering(cols=['rank'], algorithm='match_rating_codex') + expected = { 'rank': { '10': { 'suggestion': 10, + 'suggestions': [10], + 'suggestions_size': 1, + 'total_count': 2}, + '7': { 'suggestion': 7, + 'suggestions': [7], + 'suggestions_size': 1, + 'total_count': 2}, + '8': { 'suggestion': 8, + 'suggestions': [8], + 'suggestions_size': 1, + 'total_count': 2}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_metaphone(self): + df = self.df + result = df.string_clustering(cols=['rank'], algorithm='metaphone') + expected = { 'rank': { '': { 'suggestion': 10, + 'suggestions': [10, 7, 8], + 'suggestions_size': 3, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_ngram_fingerprint(self): + df = self.df + result = df.string_clustering(cols=['rank'], algorithm='ngram_fingerprint') + expected = { 'rank': { '': { 'suggestion': 7, + 'suggestions': [7, 8], + 'suggestions_size': 2, + 'total_count': 4}, + '10': { 'suggestion': 10, + 'suggestions': [10], + 'suggestions_size': 1, + 'total_count': 2}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_nysiis(self): + df = self.df + result = df.string_clustering(cols=['rank'], algorithm='nysiis') + expected = { 'rank': { '10': { 'suggestion': 10, + 'suggestions': [10], + 'suggestions_size': 1, + 'total_count': 2}, + '7': { 'suggestion': 7, + 'suggestions': [7], + 'suggestions_size': 1, + 'total_count': 2}, + '8': { 'suggestion': 8, + 'suggestions': [8], + 'suggestions_size': 1, + 'total_count': 2}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_numeric_soundex(self): + df = self.df + result = df.string_clustering(cols=['rank'], algorithm='soundex') + expected = { 'rank': { '1000': { 'suggestion': 10, + 'suggestions': [10], + 'suggestions_size': 1, + 'total_count': 2}, + '7000': { 'suggestion': 7, + 'suggestions': [7], + 'suggestions_size': 1, + 'total_count': 2}, + '8000': { 'suggestion': 8, + 'suggestions': [8], + 'suggestions_size': 1, + 'total_count': 2}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_string_double_metaphone(self): + df = self.df + result = df.string_clustering(cols=['names'], algorithm='double_metaphone') + expected = { 'names': { ('APTMS', ''): { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}, + ('ARNT', ''): { 'suggestion': 'ironhide&', + 'suggestions': ['ironhide&'], + 'suggestions_size': 1, + 'total_count': 1}, + ('JS', 'AS'): { 'suggestion': 'Jazz', + 'suggestions': ['Jazz'], + 'suggestions_size': 1, + 'total_count': 1}, + ('MKTRN', ''): { 'suggestion': 'Megatron', + 'suggestions': ['Megatron'], + 'suggestions_size': 1, + 'total_count': 1}, + ('MTRPLKSKSKSKSKS', ''): { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + ('PMPLLP', ''): { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_string_fingerprint(self): + df = self.df + result = df.string_clustering(cols=['names'], algorithm='fingerprint') + expected = { 'names': { 'bumblebee': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'ironhide': { 'suggestion': 'ironhide&', + 'suggestions': ['ironhide&'], + 'suggestions_size': 1, + 'total_count': 1}, + 'jazz': { 'suggestion': 'Jazz', + 'suggestions': ['Jazz'], + 'suggestions_size': 1, + 'total_count': 1}, + 'megatron': { 'suggestion': 'Megatron', + 'suggestions': ['Megatron'], + 'suggestions_size': 1, + 'total_count': 1}, + 'metroplex': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'optimus': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_string_levenshtein(self): + df = self.df + result = df.string_clustering(cols=['names'], algorithm='levenshtein') + expected = { 'names': { 'bumblebee': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee', 'ironhide'], + 'suggestions_size': 2, + 'total_count': 6}, + 'ironhide': { 'suggestion': 'ironhide', + 'suggestions': ['ironhide', 'optimus'], + 'suggestions_size': 2, + 'total_count': 6}, + 'jazz': { 'suggestion': 'jazz', + 'suggestions': ['jazz', 'optimus'], + 'suggestions_size': 2, + 'total_count': 6}, + 'megatron': { 'suggestion': 'megatron', + 'suggestions': ['megatron', 'metroplex'], + 'suggestions_size': 2, + 'total_count': 6}, + 'metroplex': { 'suggestion': 'metroplex', + 'suggestions': ['metroplex', 'megatron'], + 'suggestions_size': 2, + 'total_count': 6}, + 'optimus': { 'suggestion': 'optimus', + 'suggestions': ['optimus', 'ironhide'], + 'suggestions_size': 2, + 'total_count': 6}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_string_match_rating_codex(self): + df = self.df + result = df.string_clustering(cols=['names'], algorithm='match_rating_codex') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_string_metaphone(self): + df = self.df + result = df.string_clustering(cols=['names'], algorithm='metaphone') + expected = { 'names': { 'BMBLB ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'IRNHT': { 'suggestion': 'ironhide&', + 'suggestions': ['ironhide&'], + 'suggestions_size': 1, + 'total_count': 1}, + 'JS': { 'suggestion': 'Jazz', + 'suggestions': ['Jazz'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MKTRN': { 'suggestion': 'Megatron', + 'suggestions': ['Megatron'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRPLKS': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'OPTMS': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_string_ngram_fingerprint(self): + df = self.df + result = df.string_clustering(cols=['names'], algorithm='ngram_fingerprint') + expected = { 'names': { 'ateggameonrotr': { 'suggestion': 'Megatron', + 'suggestions': ['Megatron'], + 'suggestions_size': 1, + 'total_count': 1}, + 'azjazz': { 'suggestion': 'Jazz', + 'suggestions': ['Jazz'], + 'suggestions_size': 1, + 'total_count': 1}, + 'beblbuebeelembum': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'dehiidirnhonro': { 'suggestion': 'ironhide&', + 'suggestions': ['ironhide&'], + 'suggestions_size': 1, + 'total_count': 1}, + 'etexlemeopplrotr': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'immuoppttius': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + def test_string_clustering_string_nysiis(self): + df = self.df + result = df.string_clustering(cols=['names'], algorithm='nysiis') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_string_soundex(self): + df = self.df + result = df.string_clustering(cols=['names'], algorithm='soundex') + expected = { 'names': { 'B514': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'I653': { 'suggestion': 'ironhide&', + 'suggestions': ['ironhide&'], + 'suggestions_size': 1, + 'total_count': 1}, + 'J200': { 'suggestion': 'Jazz', + 'suggestions': ['Jazz'], + 'suggestions_size': 1, + 'total_count': 1}, + 'M236': { 'suggestion': 'Megatron', + 'suggestions': ['Megatron'], + 'suggestions_size': 1, + 'total_count': 1}, + 'M361': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'O135': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + + +class TestStringclusteringDask(TestStringclusteringPandas): + config = {'engine': 'dask', 'n_partitions': 1} + + +class TestStringclusteringPartitionDask(TestStringclusteringPandas): + config = {'engine': 'dask', 'n_partitions': 2} + + +try: + import cudf # pyright: reportMissingImports=false +except: + pass +else: + class TestStringclusteringCUDF(TestStringclusteringPandas): + config = {'engine': 'cudf'} + + +try: + import dask_cudf # pyright: reportMissingImports=false +except: + pass +else: + class TestStringclusteringDC(TestStringclusteringPandas): + config = {'engine': 'dask_cudf', 'n_partitions': 1} + + +try: + import dask_cudf # pyright: reportMissingImports=false +except: + pass +else: + class TestStringclusteringPartitionDC(TestStringclusteringPandas): + config = {'engine': 'dask_cudf', 'n_partitions': 2} + + +try: + import pyspark # pyright: reportMissingImports=false +except: + pass +else: + class TestStringclusteringSpark(TestStringclusteringPandas): + config = {'engine': 'spark'} + + +try: + import vaex # pyright: reportMissingImports=false +except: + pass +else: + class TestStringclusteringVaex(TestStringclusteringPandas): + config = {'engine': 'vaex'} From 61fc3797ba6e5e39ce8d77b6144d3743b4fd01e4 Mon Sep 17 00:00:00 2001 From: Jose Angel Hernao Date: Mon, 6 Dec 2021 16:48:59 -0600 Subject: [PATCH 2/2] Add string clustering tests --- tests/creators/creator_stringclustering.py | 52 +- tests/test_created__stringclustering.py | 1843 ++++++++++---------- 2 files changed, 939 insertions(+), 956 deletions(-) diff --git a/tests/creators/creator_stringclustering.py b/tests/creators/creator_stringclustering.py index 8a63d1f66..45430904b 100644 --- a/tests/creators/creator_stringclustering.py +++ b/tests/creators/creator_stringclustering.py @@ -19,10 +19,10 @@ def create(): ('Date Type'): [datetime.datetime(2016, 9, 10), datetime.datetime(2015, 8, 10), datetime.datetime(2014, 6, 24), datetime.datetime(2013, 6, 24), datetime.datetime(2012, 5, 10), datetime.datetime(2011, 4, 10)], ('age', 'int'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'string'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], - ('names', 'str'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], + ('names', 'str'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'time'): [datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0), datetime.datetime(2014, 6, 24, 0, 0)], ('weight(t)', 'float'): [4.3, 2.0, 4.0, 1.8, 5.7, None] - }) + }) t = TestCreator(op, df, name="stringclustering", configs=default_configs) @@ -62,6 +62,54 @@ def create(): t.create(method="string_clustering", variant="multiple_soundex", cols=["NullType","Cybertronian","timestamp"], algorithm="soundex") t.create(method="string_clustering", variant="multiple_levenshtein", cols=["NullType","Cybertronian","timestamp"], algorithm="levenshtein") + t.create(method="cols.fingerprint", variant="all", cols="*") + t.create(method="cols.fingerprint", variant="string", cols=["names"]) + t.create(method="cols.fingerprint", variant="numeric", cols=["rank"], output_cols=["rk"]) + t.create(method="cols.fingerprint", variant="multiple", cols=["NullType","Cybertronian","timestamp"], output_cols=["nt","ct","ts"]) + + t.create(method="cols.pos", variant="single", cols=["names"]) + t.create(method="cols.pos", variant="multiple", cols=["date arrival","japanese name","last date seen"], output_cols=["da","jn","lds"]) + + t.create(method="cols.ngrams", variant="single", cols=["names"]) + t.create(method="cols.ngrams", variant="multiple", cols=["date arrival","japanese name","last date seen"], n_size=1, output_cols=["da","jn","lds"]) + + t.create(method="cols.ngram_fingerprint", variant="all", cols="*") + t.create(method="cols.ngram_fingerprint", variant="string", cols=["function(binary)"], n_size=25) + t.create(method="cols.ngram_fingerprint", variant="numeric", cols=["rank"], output_cols=["rk"]) + t.create(method="cols.ngram_fingerprint", variant="multiple", cols=["NullType","Cybertronian","timestamp"], n_size=4, output_cols=["nt","ct","ts"]) + + t.create(method="cols.metaphone", variant="all", cols="*") + t.create(method="cols.metaphone", variant="string", cols=["names"]) + t.create(method="cols.metaphone", variant="numeric", cols=["rank"], output_cols=["rk"]) + t.create(method="cols.metaphone", variant="multiple", cols=["NullType","Cybertronian","timestamp"], output_cols=["nt","ct","ts"]) + + t.create(method="cols.nysiis", variant="all", cols="*") + t.create(method="cols.nysiis", variant="string", cols=["names"]) + t.create(method="cols.nysiis", variant="numeric", cols=["rank"], output_cols=["rk"]) + t.create(method="cols.nysiis", variant="multiple", cols=["NullType","Cybertronian","timestamp"], output_cols=["nt","ct","ts"]) + + t.create(method="cols.match_rating_codex", variant="all", cols="*") + t.create(method="cols.match_rating_codex", variant="string", cols=["names"]) + t.create(method="cols.match_rating_codex", variant="numeric", cols=["rank"], output_cols=["rk"]) + t.create(method="cols.match_rating_codex", variant="multiple", cols=["NullType","Cybertronian","timestamp"], output_cols=["nt","ct","ts"]) + + t.create(method="cols.double_metaphone", variant="all", cols="*") + t.create(method="cols.double_metaphone", variant="string", cols=["names"]) + t.create(method="cols.double_metaphone", variant="numeric", cols=["rank"], output_cols=["rk"]) + t.create(method="cols.double_metaphone", variant="multiple", cols=["NullType","Cybertronian","timestamp"], output_cols=["nt","ct","ts"]) + + t.create(method="cols.soundex", variant="all", cols="*") + t.create(method="cols.soundex", variant="string", cols=["names"]) + t.create(method="cols.soundex", variant="numeric", cols=["rank"], output_cols=["rk"]) + t.create(method="cols.soundex", variant="multiple", cols=["NullType","Cybertronian","timestamp"], output_cols=["nt","ct","ts"]) + + t.create(method="cols.levenshtein", variant="all_value", cols="*", value=["1a#-s","ERR","d2e","0","[]","''","","1","lu","2016","5000000","aeiou","abc#&^","2014-06-23","nan."]) + t.create(method="cols.levenshtein", variant="all_col", cols="*", other_cols=['date arrival','weight(t)','age','height(ft)','japanese name','rank','last date seen','names','last position seen','Cybertronian','NullType','Date Type','function(binary)','function','timestamp']) + t.create(method="cols.levenshtein", variant="single_value", cols=["names"], value="prime", output_cols="nms") + t.create(method="cols.levenshtein", variant="single_col", cols=["rank"], other_cols=["weight(t)"]) + t.create(method="cols.levenshtein", variant="multiple_value", cols=["last position seen","age","japanese name"], value=["10005","000","['Bumble']"]) + t.create(method="cols.levenshtein", variant="multiple_col", cols=["NullType","Cybertronian","timestamp"], other_cols=["height(ft)","function","Date Type"], output_cols=["nt-ht","ct-ft","ts-dt"]) + t.run() create() \ No newline at end of file diff --git a/tests/test_created__stringclustering.py b/tests/test_created__stringclustering.py index 4542e89a6..f48a5093f 100644 --- a/tests/test_created__stringclustering.py +++ b/tests/test_created__stringclustering.py @@ -1,4 +1,5 @@ import datetime +import numpy as np from optimus.tests.base import TestBase from optimus.helpers.json import json_encoding from optimus.helpers.functions import deep_sort, df_dicts_equal, results_equal @@ -8,408 +9,480 @@ def Timestamp(t): return datetime.datetime.strptime(t, "%Y-%m-%d %H:%M:%S") +NaT = np.datetime64('NaT') nan = float("nan") inf = float("inf") class TestStringclusteringPandas(TestBase): config = {'engine': 'pandas'} - dict = {('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'ironhide&', 'Jazz', 'Megatron', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]} + dict = {('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]} maxDiff = None + def test_cols_double_metaphone_all(self): + df = self.df.copy() + result = df.cols.double_metaphone(cols='*') + expected = self.create_dataframe(data={('NullType', 'object'): [('NN', ''), ('NN', ''), ('NN', ''), ('NN', ''), ('NN', ''), ('NN', '')], ('date arrival', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('height(ft)', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('NN', ''), ('', '')], ('last date seen', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('last position seen', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('NN', ''), ('NN', '')], ('rank', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('Cybertronian', 'object'): [('TR', ''), ('TR', ''), ('TR', ''), ('TR', ''), ('TR', ''), ('FLS', '')], ('Date Type', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('age', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('function', 'object'): [('LTR', ''), ('ASPNJ', 'ASPNK'), ('SKRT', ''), ('FRSTLTNNT', ''), ('NN', ''), ('PTLSTXN', '')], ('names', 'object'): [('APTMS', ''), ('PMPLLP', ''), ('MTRPLKS', ''), ('PMPLP', ''), ('MTRPPLKS', ''), ('MTRPLKSKSKSKSKS', '')], ('timestamp', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('weight(t)', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('NN', '')]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_double_metaphone_multiple(self): + df = self.df.copy() + result = df.cols.double_metaphone(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('nt', 'object'): [('NN', ''), ('NN', ''), ('NN', ''), ('NN', ''), ('NN', ''), ('NN', '')], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('ct', 'object'): [('TR', ''), ('TR', ''), ('TR', ''), ('TR', ''), ('TR', ''), ('FLS', '')], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('ts', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_double_metaphone_numeric(self): + df = self.df.copy() + result = df.cols.double_metaphone(cols=['rank'], output_cols=['rk']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'object'): [('', ''), ('', ''), ('', ''), ('', ''), ('', ''), ('', '')], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_double_metaphone_string(self): + df = self.df.copy() + result = df.cols.double_metaphone(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): [('APTMS', ''), ('PMPLLP', ''), ('MTRPLKS', ''), ('PMPLP', ''), ('MTRPPLKS', ''), ('MTRPLKSKSKSKSKS', '')], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_fingerprint_all(self): + df = self.df.copy() + result = df.cols.fingerprint(cols='*') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_fingerprint_multiple(self): + df = self.df.copy() + result = df.cols.fingerprint(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_fingerprint_numeric(self): + df = self.df.copy() + result = df.cols.fingerprint(cols=['rank'], output_cols=['rk']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_fingerprint_string(self): + df = self.df.copy() + result = df.cols.fingerprint(cols=['names']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_all_col(self): + df = self.df.copy() + result = df.cols.levenshtein(cols='*', other_cols=['date arrival', 'weight(t)', 'age', 'height(ft)', 'japanese name', 'rank', 'last date seen', 'names', 'last position seen', 'Cybertronian', 'NullType', 'Date Type', 'function(binary)', 'function', 'timestamp']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_all_value(self): + df = self.df.copy() + result = df.cols.levenshtein(cols='*', value=['1a#-s', 'ERR', 'd2e', '0', '[]', "''", '', '1', 'lu', '2016', '5000000', 'aeiou', 'abc#&^', '2014-06-23', 'nan.']) + expected = self.create_dataframe(data={('NullType', 'int64'): [5, 5, 5, 5, 5, 5], ('date arrival', 'int64'): [10, 10, 10, 10, 10, 10], ('height(ft)', 'int64'): [4, 4, 4, 4, 3, 5], ('last date seen', 'int64'): [9, 9, 9, 9, 9, 9], ('last position seen', 'int64'): [20, 20, 21, 21, 4, 4], ('rank', 'int64'): [2, 2, 2, 2, 2, 2], ('Cybertronian', 'int64'): [4, 4, 4, 4, 4, 5], ('Date Type', 'int64'): [9, 9, 9, 9, 9, 9], ('age', 'int64'): [7, 7, 7, 7, 7, 7], ('function', 'int64'): [6, 9, 8, 16, 4, 14], ('names', 'int64'): [7, 12, 9, 9, 11, 13], ('timestamp', 'int64'): [10, 10, 10, 10, 10, 10], ('weight(t)', 'int64'): [6, 6, 6, 6, 6, 6]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_multiple_col(self): + df = self.df.copy() + result = df.cols.levenshtein(cols=['NullType', 'Cybertronian', 'timestamp'], other_cols=['height(ft)', 'function', 'Date Type'], output_cols=['nt-ht', 'ct-ft', 'ts-dt']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ("NullType_['nt-ht', 'ct-ft', 'ts-dt']", 'int64'): [5, 4, 4, 4, 3, 5], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ("Cybertronian_['nt-ht', 'ct-ft', 'ts-dt']", 'int64'): [5, 8, 7, 13, 3, 12], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ("timestamp_['nt-ht', 'ct-ft', 'ts-dt']", 'int64'): [4, 4, 0, 1, 4, 4], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_multiple_value(self): + df = self.df.copy() + result = df.cols.levenshtein(cols=['last position seen', 'age', 'japanese name'], value=['10005', '000', "['Bumble']"]) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_single_col(self): + df = self.df.copy() + result = df.cols.levenshtein(cols=['rank'], other_cols=['weight(t)']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [3, 3, 3, 2, 3, 3], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_levenshtein_single_value(self): + df = self.df.copy() + result = df.cols.levenshtein(cols=['names'], value='prime', output_cols='nms') + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('nms', 'int64'): [4, 11, 7, 8, 9, 11], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_match_rating_codex_all(self): + df = self.df.copy() + result = df.cols.match_rating_codex(cols='*') + expected = self.create_dataframe(data={('NullType', 'object'): ['N', 'N', 'N', 'N', 'N', 'N'], ('date arrival', 'object'): ['198/10', '198/10', '198/10', '198/10', '198/10', '198/10'], ('height(ft)', 'object'): ['-28.0', '17.0', '26.0', '13.0', 'N', '30.0'], ('last date seen', 'object'): ['201/10', '201/10', '201/10', '201/10', '201/10', '201/10'], ('last position seen', 'object'): ['19.201', '10.534', '37.356', '3.6153', 'N', 'N'], ('rank', 'object'): ['10', '7', '7', '8', '10', '8'], ('Cybertronian', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FLS'], ('Date Type', 'object'): ['201-10', '201-10', '201-24', '201-24', '201-10', '201-10'], ('age', 'object'): ['50', '50', '50', '50', '50', '50'], ('function', 'object'): ['LDR', 'ESPNG', 'SCRTY', 'FRSTNT', 'N', 'BTLSTN'], ('names', 'object'): ['OPTMS', 'BMB#BÉ', 'MTRPLX', 'BMBLB', 'MÉTL-X', 'MTR)^$'], ('timestamp', 'object'): ['201-24', '201-24', '201-24', '201-24', '201-24', '201-24'], ('weight(t)', 'object'): ['4.3', '2.0', '4.0', '1.8', '5.7', 'N']}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_match_rating_codex_multiple(self): + df = self.df.copy() + result = df.cols.match_rating_codex(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('nt', 'object'): ['N', 'N', 'N', 'N', 'N', 'N'], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('ct', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FLS'], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('ts', 'object'): ['201-24', '201-24', '201-24', '201-24', '201-24', '201-24'], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_match_rating_codex_numeric(self): + df = self.df.copy() + result = df.cols.match_rating_codex(cols=['rank'], output_cols=['rk']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'object'): ['10', '7', '7', '8', '10', '8'], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_match_rating_codex_string(self): + df = self.df.copy() + result = df.cols.match_rating_codex(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['OPTMS', 'BMB#BÉ', 'MTRPLX', 'BMBLB', 'MÉTL-X', 'MTR)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_metaphone_all(self): + df = self.df.copy() + result = df.cols.metaphone(cols='*') + expected = self.create_dataframe(data={('NullType', 'object'): ['NN', 'NN', 'NN', 'NN', 'NN', 'NN'], ('date arrival', 'object'): ['', '', '', '', '', ''], ('height(ft)', 'object'): ['', '', '', '', 'NN', ''], ('last date seen', 'object'): ['', '', '', '', '', ''], ('last position seen', 'object'): ['', '', '', '', 'NN', 'NN'], ('rank', 'object'): ['', '', '', '', '', ''], ('Cybertronian', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FLS'], ('Date Type', 'object'): ['', '', '', '', '', ''], ('age', 'object'): ['', '', '', '', '', ''], ('function', 'object'): ['LTR', 'ESPNJ', 'SKRT', 'FRST LTNNT', 'NN', 'BTL STXN'], ('names', 'object'): ['OPTMS', 'BMBLB ', 'MTRPLKS', 'BMBLB', 'MTRP LKS', 'MTRPLKS'], ('timestamp', 'object'): ['', '', '', '', '', ''], ('weight(t)', 'object'): ['', '', '', '', '', 'NN']}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_metaphone_multiple(self): + df = self.df.copy() + result = df.cols.metaphone(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('nt', 'object'): ['NN', 'NN', 'NN', 'NN', 'NN', 'NN'], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('ct', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FLS'], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('ts', 'object'): ['', '', '', '', '', ''], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_metaphone_numeric(self): + df = self.df.copy() + result = df.cols.metaphone(cols=['rank'], output_cols=['rk']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'object'): ['', '', '', '', '', ''], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_metaphone_string(self): + df = self.df.copy() + result = df.cols.metaphone(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['OPTMS', 'BMBLB ', 'MTRPLKS', 'BMBLB', 'MTRP LKS', 'MTRPLKS'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngram_fingerprint_all(self): + df = self.df.copy() + result = df.cols.ngram_fingerprint(cols='*') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngram_fingerprint_multiple(self): + df = self.df.copy() + result = df.cols.ngram_fingerprint(cols=['NullType', 'Cybertronian', 'timestamp'], n_size=4, output_cols=['nt', 'ct', 'ts']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngram_fingerprint_numeric(self): + df = self.df.copy() + result = df.cols.ngram_fingerprint(cols=['rank'], output_cols=['rk']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngram_fingerprint_string(self): + df = self.df.copy() + result = df.cols.ngram_fingerprint(cols=['function(binary)'], n_size=25) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngrams_multiple(self): + df = self.df.copy() + result = df.cols.ngrams(cols=['date arrival', 'japanese name', 'last date seen'], n_size=1, output_cols=['da', 'jn', 'lds']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_ngrams_single(self): + df = self.df.copy() + result = df.cols.ngrams(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): [['Op', 'pt', 'ti', 'im', 'mu', 'us'], ['bu', 'um', 'mb', 'bl', 'l#', '#e', 'eb', 'bé', 'éé', 'é ', ' '], ['Me', 'et', 'tr', 'ro', 'op', 'pl', 'le', 'ex'], ['bu', 'um', 'mb', 'bl', 'le', 'eb', 'be', 'ee'], ['mé', 'ét', 'tr', 'ro', 'op', 'p´', '´l', 'le', 'e-', '-x'], ['Me', 'et', 'tr', 'ro', 'op', 'pl', 'le', 'ex', 'x_', '_)', ')^', '^$']], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_nysiis_all(self): + df = self.df.copy() + result = df.cols.nysiis(cols='*') + expected = self.create_dataframe(data={('NullType', 'object'): ['NAN', 'NAN', 'NAN', 'NAN', 'NAN', 'NAN'], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'object'): ['-28.0', '17.0', '26.0', '13.0', 'NAN', '30.0'], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '201/04/10'], ('last position seen', 'object'): ['19.42735,-9.201', '10.642707,-71.612534', '37.789563,-12.40356', '3.6706,-17.84153', 'NAN', 'NAN'], ('rank', 'object'): ['10', '7', '7', '8', '10', '8'], ('Cybertronian', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FALS'], ('Date Type', 'object'): ['2016-09-10', '2015-08-10', '2014-06-24', '2013-06-24', '2012-05-10', '201-04-10'], ('age', 'object'): ['50', '50', '50', '50', '50', '50'], ('function', 'object'): ['LADAR', 'ESPANAG', 'SACARATY', 'FARST', 'NAN', 'BATL'], ('names', 'object'): ['OPTAN', 'BANBL#ABÉ', 'MATRAPLAX', 'BANBLABY', 'MÉTRAP´LA-X', 'MATRAPLAX_)^$'], ('timestamp', 'object'): ['2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24'], ('weight(t)', 'object'): ['4.3', '2.0', '4.0', '1.8', '5.7', 'NAN']}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_nysiis_multiple(self): + df = self.df.copy() + result = df.cols.nysiis(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('nt', 'object'): ['NAN', 'NAN', 'NAN', 'NAN', 'NAN', 'NAN'], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('ct', 'object'): ['TR', 'TR', 'TR', 'TR', 'TR', 'FALS'], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('ts', 'object'): ['2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24', '2014-06-24'], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_nysiis_numeric(self): + df = self.df.copy() + result = df.cols.nysiis(cols=['rank'], output_cols=['rk']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'object'): ['10', '7', '7', '8', '10', '8'], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_nysiis_string(self): + df = self.df.copy() + result = df.cols.nysiis(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['OPTAN', 'BANBL#ABÉ', 'MATRAPLAX', 'BANBLABY', 'MÉTRAP´LA-X', 'MATRAPLAX_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_pos_multiple(self): + df = self.df.copy() + result = df.cols.pos(cols=['date arrival', 'japanese name', 'last date seen'], output_cols=['da', 'jn', 'lds']) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_pos_single(self): + df = self.df.copy() + result = df.cols.pos(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): [[('Optimus', 'NN')], [('bumbl#ebéé', 'NN')], [('Metroplex', 'NNP')], [('bumblebee', 'NN')], [('métrop´le-x', 'NN')], [('Metroplex_)^$', 'NN')]], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_soundex_all(self): + df = self.df.copy() + result = df.cols.soundex(cols='*') + expected = self.create_dataframe(data={('NullType', 'object'): ['N500', 'N500', 'N500', 'N500', 'N500', 'N500'], ('date arrival', 'object'): ['1000', '1000', '1000', '1000', '1000', '1000'], ('height(ft)', 'object'): ['-000', '1000', '2000', '1000', 'N500', '3000'], ('last date seen', 'object'): ['2000', '2000', '2000', '2000', '2000', '2000'], ('last position seen', 'object'): ['1000', '1000', '3000', '3000', 'N500', 'N500'], ('rank', 'object'): ['1000', '7000', '7000', '8000', '1000', '8000'], ('Cybertronian', 'object'): ['T600', 'T600', 'T600', 'T600', 'T600', 'F420'], ('Date Type', 'object'): ['2000', '2000', '2000', '2000', '2000', '2000'], ('age', 'object'): ['5000', '5000', '5000', '5000', '5000', '5000'], ('function', 'object'): ['L360', 'E215', 'S263', 'F623', 'N500', 'B342'], ('names', 'object'): ['O135', 'B514', 'M361', 'B514', 'M361', 'M361'], ('timestamp', 'object'): ['2000', '2000', '2000', '2000', '2000', '2000'], ('weight(t)', 'object'): ['4000', '2000', '4000', '1000', '5000', 'N500']}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_soundex_multiple(self): + df = self.df.copy() + result = df.cols.soundex(cols=['NullType', 'Cybertronian', 'timestamp'], output_cols=['nt', 'ct', 'ts']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('nt', 'object'): ['N500', 'N500', 'N500', 'N500', 'N500', 'N500'], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('ct', 'object'): ['T600', 'T600', 'T600', 'T600', 'T600', 'F420'], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('ts', 'object'): ['2000', '2000', '2000', '2000', '2000', '2000'], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_soundex_numeric(self): + df = self.df.copy() + result = df.cols.soundex(cols=['rank'], output_cols=['rk']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('rk', 'object'): ['1000', '7000', '7000', '8000', '1000', '8000'], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['Optimus', 'bumbl#ebéé ', 'Metroplex', 'bumblebee', 'métrop´le-x', 'Metroplex_)^$'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_cols_soundex_string(self): + df = self.df.copy() + result = df.cols.soundex(cols=['names']) + expected = self.create_dataframe(data={('NullType', 'object'): [None, None, None, None, None, None], ('date arrival', 'object'): ['1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10', '1980/04/10'], ('height(ft)', 'float64'): [-28.0, 17.0, 26.0, 13.0, nan, 300.0], ('last date seen', 'object'): ['2016/09/10', '2015/08/10', '2014/07/10', '2013/06/10', '2012/05/10', '2011/04/10'], ('last position seen', 'object'): ['19.442735,-99.201111', '10.642707,-71.612534', '37.789563,-122.400356', '33.670666,-117.841553', None, None], ('rank', 'int64'): [10, 7, 7, 8, 10, 8], ('Cybertronian', 'bool'): [True, True, True, True, True, False], ('Date Type', 'datetime64[ns]'): [Timestamp('2016-09-10 00:00:00'), Timestamp('2015-08-10 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2013-06-24 00:00:00'), Timestamp('2012-05-10 00:00:00'), Timestamp('2011-04-10 00:00:00')], ('age', 'int64'): [5000000, 5000000, 5000000, 5000000, 5000000, 5000000], ('function', 'object'): ['Leader', 'Espionage', 'Security', 'First Lieutenant', None, 'Battle Station'], ('names', 'object'): ['O135', 'B514', 'M361', 'B514', 'M361', 'M361'], ('timestamp', 'datetime64[ns]'): [Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00'), Timestamp('2014-06-24 00:00:00')], ('weight(t)', 'float64'): [4.3, 2.0, 4.0, 1.8, 5.7, nan]}, force_data_types=True) + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + def test_string_clustering_all_double_metaphone(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols='*', algorithm='double_metaphone') - expected = { 'Cybertronian': { ('FLS', ''): { 'suggestion': False, - 'suggestions': [False], - 'suggestions_size': 1, - 'total_count': 1}, - ('TR', ''): { 'suggestion': True, - 'suggestions': [True], - 'suggestions_size': 1, - 'total_count': 5}}, - 'Date Type': { ('', ''): { 'suggestion': Timestamp('2016-09-10 00:00:00'), - 'suggestions': [ Timestamp('2016-09-10 00:00:00'), - Timestamp('2015-08-10 00:00:00'), - Timestamp('2014-06-24 00:00:00'), - Timestamp('2013-06-24 00:00:00'), - Timestamp('2012-05-10 00:00:00'), - Timestamp('2011-04-10 00:00:00')], - 'suggestions_size': 6, - 'total_count': 6}}, - 'NullType': { ('NN', ''): { 'suggestion': None, - 'suggestions': [None], - 'suggestions_size': 1, - 'total_count': 6}}, - 'age': { ('', ''): { 'suggestion': 5000000, - 'suggestions': [5000000], + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_all_fingerprint(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='fingerprint') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_all_levenshtein(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='levenshtein') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_all_match_rating_codex(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='match_rating_codex') + expected = { 'Cybertronian': { 'FLS': { 'suggestion': 'False', + 'suggestions': ['False'], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], + 'suggestions_size': 1, + 'total_count': 5}}, + 'Date Type': { '201-10': { 'suggestion': '2016-09-10', + 'suggestions': [ '2016-09-10', + '2015-08-10', + '2012-05-10', + '2011-04-10'], + 'suggestions_size': 4, + 'total_count': 4}, + '201-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24', '2013-06-24'], + 'suggestions_size': 2, + 'total_count': 2}}, + 'NullType': { 'N': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 6}}, - 'date arrival': { ('', ''): { 'suggestion': '1980/04/10', + 'age': { '50': { 'suggestion': '5000000', + 'suggestions': ['5000000'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { '198/10': { 'suggestion': '1980/04/10', 'suggestions': ['1980/04/10'], 'suggestions_size': 1, 'total_count': 6}}, - 'function': { ('ASPNJ', 'ASPNK'): { 'suggestion': 'Espionage', - 'suggestions': ['Espionage'], - 'suggestions_size': 1, - 'total_count': 1}, - ('FRSTLTNNT', ''): { 'suggestion': 'First Lieutenant', - 'suggestions': ['First Lieutenant'], - 'suggestions_size': 1, - 'total_count': 1}, - ('LTR', ''): { 'suggestion': 'Leader', - 'suggestions': ['Leader'], - 'suggestions_size': 1, - 'total_count': 1}, - ('NN', ''): { 'suggestion': None, - 'suggestions': [None], - 'suggestions_size': 1, - 'total_count': 1}, - ('PTLSTXN', ''): { 'suggestion': 'Battle Station', - 'suggestions': ['Battle Station'], - 'suggestions_size': 1, - 'total_count': 1}, - ('SKRT', ''): { 'suggestion': 'Security', - 'suggestions': ['Security'], - 'suggestions_size': 1, - 'total_count': 1}}, - 'height(ft)': { ('', ''): { 'suggestion': -28.0, - 'suggestions': [-28.0, 17.0, 26.0, 13.0, 300.0], - 'suggestions_size': 5, - 'total_count': 5}, - ('NN', ''): { 'suggestion': nan, - 'suggestions': [nan], - 'suggestions_size': 1, - 'total_count': 1}}, - 'last date seen': { ('', ''): { 'suggestion': '2016/09/10', - 'suggestions': [ '2016/09/10', - '2015/08/10', - '2014/07/10', - '2013/06/10', - '2012/05/10', - '2011/04/10'], - 'suggestions_size': 6, - 'total_count': 6}}, - 'last position seen': { ('', ''): { 'suggestion': '19.442735,-99.201111', - 'suggestions': [ '19.442735,-99.201111', - '10.642707,-71.612534', - '37.789563,-122.400356', - '33.670666,-117.841553'], - 'suggestions_size': 4, - 'total_count': 4}, - ('NN', ''): { 'suggestion': None, - 'suggestions': [None], - 'suggestions_size': 1, - 'total_count': 2}}, - 'names': { ('APTMS', ''): { 'suggestion': 'Optimus', - 'suggestions': ['Optimus'], - 'suggestions_size': 1, - 'total_count': 1}, - ('ARNT', ''): { 'suggestion': 'ironhide&', - 'suggestions': ['ironhide&'], - 'suggestions_size': 1, - 'total_count': 1}, - ('JS', 'AS'): { 'suggestion': 'Jazz', - 'suggestions': ['Jazz'], - 'suggestions_size': 1, - 'total_count': 1}, - ('MKTRN', ''): { 'suggestion': 'Megatron', - 'suggestions': ['Megatron'], - 'suggestions_size': 1, - 'total_count': 1}, - ('MTRPLKSKSKSKSKS', ''): { 'suggestion': 'Metroplex_)^$', - 'suggestions': ['Metroplex_)^$'], - 'suggestions_size': 1, - 'total_count': 1}, - ('PMPLLP', ''): { 'suggestion': 'bumbl#ebéé ', - 'suggestions': ['bumbl#ebéé '], - 'suggestions_size': 1, - 'total_count': 1}}, - 'rank': { ('', ''): { 'suggestion': 10, - 'suggestions': [10, 7, 8], - 'suggestions_size': 3, - 'total_count': 6}}, - 'timestamp': { ('', ''): { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [Timestamp('2014-06-24 00:00:00')], - 'suggestions_size': 1, - 'total_count': 6}}, - 'weight(t)': { ('', ''): { 'suggestion': 4.3, - 'suggestions': [4.3, 2.0, 4.0, 1.8, 5.7], - 'suggestions_size': 5, - 'total_count': 5}, - ('NN', ''): { 'suggestion': nan, - 'suggestions': [nan], - 'suggestions_size': 1, - 'total_count': 1}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) - - def test_string_clustering_all_fingerprint(self): - df = self.df - result = df.string_clustering(cols='*', algorithm='fingerprint') - expected = { 'Cybertronian': { 'false': { 'suggestion': False, - 'suggestions': [False], - 'suggestions_size': 1, - 'total_count': 1}, - 'true': { 'suggestion': True, - 'suggestions': [True], - 'suggestions_size': 1, - 'total_count': 5}}, - 'Date Type': { '20110410': { 'suggestion': Timestamp('2011-04-10 00:00:00'), - 'suggestions': [ Timestamp('2011-04-10 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}, - '20120510': { 'suggestion': Timestamp('2012-05-10 00:00:00'), - 'suggestions': [ Timestamp('2012-05-10 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}, - '20130624': { 'suggestion': Timestamp('2013-06-24 00:00:00'), - 'suggestions': [ Timestamp('2013-06-24 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}, - '20140624': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [ Timestamp('2014-06-24 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}, - '20150810': { 'suggestion': Timestamp('2015-08-10 00:00:00'), - 'suggestions': [ Timestamp('2015-08-10 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}, - '20160910': { 'suggestion': Timestamp('2016-09-10 00:00:00'), - 'suggestions': [ Timestamp('2016-09-10 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}}, - 'NullType': { 'none': { 'suggestion': None, - 'suggestions': [None], - 'suggestions_size': 1, - 'total_count': 6}}, - 'age': { '5000000': { 'suggestion': 5000000, - 'suggestions': [5000000], - 'suggestions_size': 1, - 'total_count': 6}}, - 'date arrival': { '19800410': { 'suggestion': '1980/04/10', - 'suggestions': ['1980/04/10'], - 'suggestions_size': 1, - 'total_count': 6}}, - 'function': { 'battle station': { 'suggestion': 'Battle Station', - 'suggestions': ['Battle Station'], - 'suggestions_size': 1, - 'total_count': 1}, - 'espionage': { 'suggestion': 'Espionage', - 'suggestions': ['Espionage'], - 'suggestions_size': 1, - 'total_count': 1}, - 'first lieutenant': { 'suggestion': 'First Lieutenant', - 'suggestions': ['First Lieutenant'], - 'suggestions_size': 1, - 'total_count': 1}, - 'leader': { 'suggestion': 'Leader', - 'suggestions': ['Leader'], + 'function': { 'BTLSTN': { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], 'suggestions_size': 1, 'total_count': 1}, - 'none': { 'suggestion': None, - 'suggestions': [None], - 'suggestions_size': 1, - 'total_count': 1}, - 'security': { 'suggestion': 'Security', - 'suggestions': ['Security'], - 'suggestions_size': 1, - 'total_count': 1}}, - 'height(ft)': { '130': { 'suggestion': 13.0, - 'suggestions': [13.0], - 'suggestions_size': 1, - 'total_count': 1}, - '170': { 'suggestion': 17.0, - 'suggestions': [17.0], - 'suggestions_size': 1, - 'total_count': 1}, - '260': { 'suggestion': 26.0, - 'suggestions': [26.0], - 'suggestions_size': 1, - 'total_count': 1}, - '280': { 'suggestion': -28.0, - 'suggestions': [-28.0], + 'ESPNG': { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], 'suggestions_size': 1, 'total_count': 1}, - '3000': { 'suggestion': 300.0, - 'suggestions': [300.0], + 'FRSTNT': { 'suggestion': 'First Lieutenant', + 'suggestions': ['First Lieutenant'], 'suggestions_size': 1, 'total_count': 1}, - 'nan': { 'suggestion': nan, - 'suggestions': [nan], + 'LDR': { 'suggestion': 'Leader', + 'suggestions': ['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 1}, + 'SCRTY': { 'suggestion': 'Security', + 'suggestions': ['Security'], 'suggestions_size': 1, 'total_count': 1}}, - 'last date seen': { '20110410': { 'suggestion': '2011/04/10', - 'suggestions': ['2011/04/10'], - 'suggestions_size': 1, - 'total_count': 1}, - '20120510': { 'suggestion': '2012/05/10', - 'suggestions': ['2012/05/10'], - 'suggestions_size': 1, - 'total_count': 1}, - '20130610': { 'suggestion': '2013/06/10', - 'suggestions': ['2013/06/10'], - 'suggestions_size': 1, - 'total_count': 1}, - '20140710': { 'suggestion': '2014/07/10', - 'suggestions': ['2014/07/10'], - 'suggestions_size': 1, - 'total_count': 1}, - '20150810': { 'suggestion': '2015/08/10', - 'suggestions': ['2015/08/10'], - 'suggestions_size': 1, - 'total_count': 1}, - '20160910': { 'suggestion': '2016/09/10', - 'suggestions': ['2016/09/10'], - 'suggestions_size': 1, - 'total_count': 1}}, - 'last position seen': { '1064270771612534': { 'suggestion': '10.642707,-71.612534', - 'suggestions': [ '10.642707,-71.612534'], - 'suggestions_size': 1, - 'total_count': 1}, - '1944273599201111': { 'suggestion': '19.442735,-99.201111', - 'suggestions': [ '19.442735,-99.201111'], - 'suggestions_size': 1, - 'total_count': 1}, - '33670666117841553': { 'suggestion': '33.670666,-117.841553', - 'suggestions': [ '33.670666,-117.841553'], - 'suggestions_size': 1, - 'total_count': 1}, - '37789563122400356': { 'suggestion': '37.789563,-122.400356', - 'suggestions': [ '37.789563,-122.400356'], - 'suggestions_size': 1, - 'total_count': 1}, - 'none': { 'suggestion': None, - 'suggestions': [None], - 'suggestions_size': 1, - 'total_count': 2}}, - 'names': { 'bumblebee': { 'suggestion': 'bumbl#ebéé ', - 'suggestions': ['bumbl#ebéé '], + 'height(ft)': { '-28.0': { 'suggestion': '-28.0', + 'suggestions': ['-28.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '13.0': { 'suggestion': '13.0', + 'suggestions': ['13.0'], 'suggestions_size': 1, 'total_count': 1}, - 'ironhide': { 'suggestion': 'ironhide&', - 'suggestions': ['ironhide&'], - 'suggestions_size': 1, - 'total_count': 1}, - 'jazz': { 'suggestion': 'Jazz', - 'suggestions': ['Jazz'], - 'suggestions_size': 1, - 'total_count': 1}, - 'megatron': { 'suggestion': 'Megatron', - 'suggestions': ['Megatron'], - 'suggestions_size': 1, - 'total_count': 1}, - 'metroplex': { 'suggestion': 'Metroplex_)^$', - 'suggestions': ['Metroplex_)^$'], + '17.0': { 'suggestion': '17.0', + 'suggestions': ['17.0'], 'suggestions_size': 1, 'total_count': 1}, - 'optimus': { 'suggestion': 'Optimus', - 'suggestions': ['Optimus'], - 'suggestions_size': 1, - 'total_count': 1}}, - 'rank': { '10': { 'suggestion': 10, - 'suggestions': [10], - 'suggestions_size': 1, - 'total_count': 2}, - '7': { 'suggestion': 7, - 'suggestions': [7], - 'suggestions_size': 1, - 'total_count': 2}, - '8': { 'suggestion': 8, - 'suggestions': [8], - 'suggestions_size': 1, - 'total_count': 2}}, - 'timestamp': { '20140624': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [ Timestamp('2014-06-24 00:00:00')], - 'suggestions_size': 1, - 'total_count': 6}}, - 'weight(t)': { '18': { 'suggestion': 1.8, - 'suggestions': [1.8], + '26.0': { 'suggestion': '26.0', + 'suggestions': ['26.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '30.0': { 'suggestion': '300.0', + 'suggestions': ['300.0'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N': { 'suggestion': 'nan', + 'suggestions': ['nan'], 'suggestions_size': 1, - 'total_count': 1}, - '20': { 'suggestion': 2.0, - 'suggestions': [2.0], + 'total_count': 1}}, + 'last date seen': { '201/10': { 'suggestion': '2016/09/10', + 'suggestions': [ '2016/09/10', + '2015/08/10', + '2014/07/10', + '2013/06/10', + '2012/05/10', + '2011/04/10'], + 'suggestions_size': 6, + 'total_count': 6}}, + 'last position seen': { '10.534': { 'suggestion': '10.642707,-71.612534', + 'suggestions': ['10.642707,-71.612534'], + 'suggestions_size': 1, + 'total_count': 1}, + '19.201': { 'suggestion': '19.442735,-99.201111', + 'suggestions': ['19.442735,-99.201111'], + 'suggestions_size': 1, + 'total_count': 1}, + '3.6153': { 'suggestion': '33.670666,-117.841553', + 'suggestions': ['33.670666,-117.841553'], + 'suggestions_size': 1, + 'total_count': 1}, + '37.356': { 'suggestion': '37.789563,-122.400356', + 'suggestions': ['37.789563,-122.400356'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 2}}, + 'names': { 'BMB#BÉ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], 'suggestions_size': 1, 'total_count': 1}, - '40': { 'suggestion': 4.0, - 'suggestions': [4.0], + 'BMBLB': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTR)^$': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], 'suggestions_size': 1, 'total_count': 1}, - '43': { 'suggestion': 4.3, - 'suggestions': [4.3], + 'MTRPLX': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex'], 'suggestions_size': 1, 'total_count': 1}, - '57': { 'suggestion': 5.7, - 'suggestions': [5.7], + 'MÉTL-X': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], 'suggestions_size': 1, 'total_count': 1}, - 'nan': { 'suggestion': nan, - 'suggestions': [nan], + 'OPTMS': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { '10': { 'suggestion': '10', + 'suggestions': ['10'], + 'suggestions_size': 1, + 'total_count': 2}, + '7': { 'suggestion': '7', + 'suggestions': ['7'], + 'suggestions_size': 1, + 'total_count': 2}, + '8': { 'suggestion': '8', + 'suggestions': ['8'], + 'suggestions_size': 1, + 'total_count': 2}}, + 'timestamp': { '201-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { '1.8': { 'suggestion': '1.8', + 'suggestions': ['1.8'], 'suggestions_size': 1, - 'total_count': 1}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) - - def test_string_clustering_all_levenshtein(self): - df = self.df - result = df.string_clustering(cols='*', algorithm='levenshtein') - expected = { 'weight(t)': { '18': { 'suggestion': '18', - 'suggestions': ['18', '43'], - 'suggestions_size': 2, - 'total_count': 6}, - '20': { 'suggestion': '20', - 'suggestions': ['20', '40'], - 'suggestions_size': 2, - 'total_count': 6}, - '40': { 'suggestion': '40', - 'suggestions': ['40', '43'], - 'suggestions_size': 2, - 'total_count': 6}, - '43': { 'suggestion': '43', - 'suggestions': ['43', '40'], - 'suggestions_size': 2, - 'total_count': 6}, - '57': { 'suggestion': '57', - 'suggestions': ['57', '43'], - 'suggestions_size': 2, - 'total_count': 6}, - 'nan': { 'suggestion': 'nan', - 'suggestions': ['nan', '43'], - 'suggestions_size': 2, - 'total_count': 6}}} + 'total_count': 1}, + '2.0': { 'suggestion': '2.0', + 'suggestions': ['2.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '4.0': { 'suggestion': '4.0', + 'suggestions': ['4.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '4.3': { 'suggestion': '4.3', + 'suggestions': ['4.3'], + 'suggestions_size': 1, + 'total_count': 1}, + '5.7': { 'suggestion': '5.7', + 'suggestions': ['5.7'], + 'suggestions_size': 1, + 'total_count': 1}, + 'N': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) - def test_string_clustering_all_match_rating_codex(self): - df = self.df - result = df.string_clustering(cols='*', algorithm='match_rating_codex') - # The following value does not represent a correct output of the operation - expected = self.dict - self.assertTrue(result.equals(expected, decimal=True, assertion=True)) - def test_string_clustering_all_metaphone(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols='*', algorithm='metaphone') - expected = { 'Cybertronian': { 'FLS': { 'suggestion': False, - 'suggestions': [False], + expected = { 'Cybertronian': { 'FLS': { 'suggestion': 'False', + 'suggestions': ['False'], 'suggestions_size': 1, 'total_count': 1}, - 'TR': { 'suggestion': True, - 'suggestions': [True], + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], 'suggestions_size': 1, 'total_count': 5}}, - 'Date Type': { '': { 'suggestion': Timestamp('2016-09-10 00:00:00'), - 'suggestions': [ Timestamp('2016-09-10 00:00:00'), - Timestamp('2015-08-10 00:00:00'), - Timestamp('2014-06-24 00:00:00'), - Timestamp('2013-06-24 00:00:00'), - Timestamp('2012-05-10 00:00:00'), - Timestamp('2011-04-10 00:00:00')], + 'Date Type': { '': { 'suggestion': '2016-09-10', + 'suggestions': [ '2016-09-10', + '2015-08-10', + '2014-06-24', + '2013-06-24', + '2012-05-10', + '2011-04-10'], 'suggestions_size': 6, 'total_count': 6}}, - 'NullType': { 'NN': { 'suggestion': None, - 'suggestions': [None], + 'NullType': { 'NN': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 6}}, - 'age': { '': { 'suggestion': 5000000, - 'suggestions': [5000000], + 'age': { '': { 'suggestion': '5000000', + 'suggestions': ['5000000'], 'suggestions_size': 1, 'total_count': 6}}, 'date arrival': { '': { 'suggestion': '1980/04/10', @@ -432,20 +505,24 @@ def test_string_clustering_all_metaphone(self): 'suggestions': ['Leader'], 'suggestions_size': 1, 'total_count': 1}, - 'NN': { 'suggestion': None, - 'suggestions': [None], + 'NN': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 1}, 'SKRT': { 'suggestion': 'Security', 'suggestions': ['Security'], 'suggestions_size': 1, 'total_count': 1}}, - 'height(ft)': { '': { 'suggestion': -28.0, - 'suggestions': [-28.0, 17.0, 26.0, 13.0, 300.0], + 'height(ft)': { '': { 'suggestion': '-28.0', + 'suggestions': [ '-28.0', + '17.0', + '26.0', + '13.0', + '300.0'], 'suggestions_size': 5, 'total_count': 5}, - 'NN': { 'suggestion': nan, - 'suggestions': [nan], + 'NN': { 'suggestion': 'nan', + 'suggestions': ['nan'], 'suggestions_size': 1, 'total_count': 1}}, 'last date seen': { '': { 'suggestion': '2016/09/10', @@ -464,288 +541,286 @@ def test_string_clustering_all_metaphone(self): '33.670666,-117.841553'], 'suggestions_size': 4, 'total_count': 4}, - 'NN': { 'suggestion': None, - 'suggestions': [None], + 'NN': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 2}}, - 'names': { 'BMBLB ': { 'suggestion': 'bumbl#ebéé ', + 'names': { 'BMBLB': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'BMBLB ': { 'suggestion': 'bumbl#ebéé ', 'suggestions': ['bumbl#ebéé '], 'suggestions_size': 1, 'total_count': 1}, - 'IRNHT': { 'suggestion': 'ironhide&', - 'suggestions': ['ironhide&'], - 'suggestions_size': 1, - 'total_count': 1}, - 'JS': { 'suggestion': 'Jazz', - 'suggestions': ['Jazz'], - 'suggestions_size': 1, - 'total_count': 1}, - 'MKTRN': { 'suggestion': 'Megatron', - 'suggestions': ['Megatron'], - 'suggestions_size': 1, - 'total_count': 1}, - 'MTRPLKS': { 'suggestion': 'Metroplex_)^$', - 'suggestions': ['Metroplex_)^$'], - 'suggestions_size': 1, - 'total_count': 1}, + 'MTRP LKS': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRPLKS': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex', 'Metroplex_)^$'], + 'suggestions_size': 2, + 'total_count': 2}, 'OPTMS': { 'suggestion': 'Optimus', 'suggestions': ['Optimus'], 'suggestions_size': 1, 'total_count': 1}}, - 'rank': { '': { 'suggestion': 10, - 'suggestions': [10, 7, 8], + 'rank': { '': { 'suggestion': '10', + 'suggestions': ['10', '7', '8'], 'suggestions_size': 3, 'total_count': 6}}, - 'timestamp': { '': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'timestamp': { '': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], 'suggestions_size': 1, 'total_count': 6}}, - 'weight(t)': { '': { 'suggestion': 4.3, - 'suggestions': [4.3, 2.0, 4.0, 1.8, 5.7], + 'weight(t)': { '': { 'suggestion': '4.3', + 'suggestions': ['4.3', '2.0', '4.0', '1.8', '5.7'], 'suggestions_size': 5, 'total_count': 5}, - 'NN': { 'suggestion': nan, - 'suggestions': [nan], + 'NN': { 'suggestion': 'nan', + 'suggestions': ['nan'], 'suggestions_size': 1, 'total_count': 1}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) def test_string_clustering_all_ngram_fingerprint(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols='*', algorithm='ngram_fingerprint') - expected = { 'Cybertronian': { 'alfalsse': { 'suggestion': False, - 'suggestions': [False], - 'suggestions_size': 1, - 'total_count': 1}, - 'rutrue': { 'suggestion': True, - 'suggestions': [True], - 'suggestions_size': 1, - 'total_count': 5}}, - 'Date Type': { '010410112041': { 'suggestion': Timestamp('2011-04-10 00:00:00'), - 'suggestions': [ Timestamp('2011-04-10 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}, - '010510122051': { 'suggestion': Timestamp('2012-05-10 00:00:00'), - 'suggestions': [ Timestamp('2012-05-10 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}, - '01061320243062': { 'suggestion': Timestamp('2013-06-24 00:00:00'), - 'suggestions': [ Timestamp('2013-06-24 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}, - '01061420244062': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [ Timestamp('2014-06-24 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}, - '01081015205081': { 'suggestion': Timestamp('2015-08-10 00:00:00'), - 'suggestions': [ Timestamp('2015-08-10 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}, - '01091016206091': { 'suggestion': Timestamp('2016-09-10 00:00:00'), - 'suggestions': [ Timestamp('2016-09-10 00:00:00')], - 'suggestions_size': 1, - 'total_count': 1}}, - 'NullType': { 'nenoon': { 'suggestion': None, - 'suggestions': [None], + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_all_nysiis(self): + df = self.df.copy() + result = df.string_clustering(cols='*', algorithm='nysiis') + expected = { 'Cybertronian': { 'FALS': { 'suggestion': 'False', + 'suggestions': ['False'], + 'suggestions_size': 1, + 'total_count': 1}, + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], 'suggestions_size': 1, - 'total_count': 6}}, - 'age': { '0050': { 'suggestion': 5000000, - 'suggestions': [5000000], - 'suggestions_size': 1, - 'total_count': 6}}, - 'date arrival': { '00041019418098': { 'suggestion': '1980/04/10', - 'suggestions': ['1980/04/10'], - 'suggestions_size': 1, - 'total_count': 6}}, - 'function': { 'addeeaerle': { 'suggestion': 'Leader', - 'suggestions': ['Leader'], + 'total_count': 5}}, + 'Date Type': { '201-04-10': { 'suggestion': '2011-04-10', + 'suggestions': ['2011-04-10'], 'suggestions_size': 1, 'total_count': 1}, - 'agesgeionaonpisp': { 'suggestion': 'Espionage', - 'suggestions': ['Espionage'], - 'suggestions_size': 1, - 'total_count': 1}, - 'aneneufiieirlinantrssttetlut': { 'suggestion': 'First ' - 'Lieutenant', - 'suggestions': [ 'First ' - 'Lieutenant'], - 'suggestions_size': 1, - 'total_count': 1}, - 'atbaesioleonsttatitltt': { 'suggestion': 'Battle Station', - 'suggestions': ['Battle Station'], - 'suggestions_size': 1, - 'total_count': 1}, - 'cuecitrisetyur': { 'suggestion': 'Security', - 'suggestions': ['Security'], + '2012-05-10': { 'suggestion': '2012-05-10', + 'suggestions': ['2012-05-10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2013-06-24': { 'suggestion': '2013-06-24', + 'suggestions': ['2013-06-24'], + 'suggestions_size': 1, + 'total_count': 1}, + '2014-06-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 1}, + '2015-08-10': { 'suggestion': '2015-08-10', + 'suggestions': ['2015-08-10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2016-09-10': { 'suggestion': '2016-09-10', + 'suggestions': ['2016-09-10'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'NullType': { 'NAN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'age': { '50': { 'suggestion': '5000000', + 'suggestions': ['5000000'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'date arrival': { '1980/04/10': { 'suggestion': '1980/04/10', + 'suggestions': ['1980/04/10'], 'suggestions_size': 1, - 'total_count': 1}, - 'nenoon': { 'suggestion': None, - 'suggestions': [None], + 'total_count': 6}}, + 'function': { 'BATL': { 'suggestion': 'Battle Station', + 'suggestions': ['Battle Station'], + 'suggestions_size': 1, + 'total_count': 1}, + 'ESPANAG': { 'suggestion': 'Espionage', + 'suggestions': ['Espionage'], + 'suggestions_size': 1, + 'total_count': 1}, + 'FARST': { 'suggestion': 'First Lieutenant', + 'suggestions': ['First Lieutenant'], + 'suggestions_size': 1, + 'total_count': 1}, + 'LADAR': { 'suggestion': 'Leader', + 'suggestions': ['Leader'], + 'suggestions_size': 1, + 'total_count': 1}, + 'NAN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 1}, + 'SACARATY': { 'suggestion': 'Security', + 'suggestions': ['Security'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'height(ft)': { '-28.0': { 'suggestion': '-28.0', + 'suggestions': ['-28.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '13.0': { 'suggestion': '13.0', + 'suggestions': ['13.0'], 'suggestions_size': 1, - 'total_count': 1}}, - 'height(ft)': { '0030': { 'suggestion': 300.0, - 'suggestions': [300.0], + 'total_count': 1}, + '17.0': { 'suggestion': '17.0', + 'suggestions': ['17.0'], 'suggestions_size': 1, 'total_count': 1}, - '1330': { 'suggestion': 13.0, - 'suggestions': [13.0], + '26.0': { 'suggestion': '26.0', + 'suggestions': ['26.0'], 'suggestions_size': 1, 'total_count': 1}, - '1770': { 'suggestion': 17.0, - 'suggestions': [17.0], + '30.0': { 'suggestion': '300.0', + 'suggestions': ['300.0'], 'suggestions_size': 1, 'total_count': 1}, - '2660': { 'suggestion': 26.0, - 'suggestions': [26.0], + 'NAN': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last date seen': { '201/04/10': { 'suggestion': '2011/04/10', + 'suggestions': ['2011/04/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2012/05/10': { 'suggestion': '2012/05/10', + 'suggestions': ['2012/05/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2013/06/10': { 'suggestion': '2013/06/10', + 'suggestions': ['2013/06/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2014/07/10': { 'suggestion': '2014/07/10', + 'suggestions': ['2014/07/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2015/08/10': { 'suggestion': '2015/08/10', + 'suggestions': ['2015/08/10'], + 'suggestions_size': 1, + 'total_count': 1}, + '2016/09/10': { 'suggestion': '2016/09/10', + 'suggestions': ['2016/09/10'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'last position seen': { '10.642707,-71.612534': { 'suggestion': '10.642707,-71.612534', + 'suggestions': [ '10.642707,-71.612534'], + 'suggestions_size': 1, + 'total_count': 1}, + '19.42735,-9.201': { 'suggestion': '19.442735,-99.201111', + 'suggestions': [ '19.442735,-99.201111'], + 'suggestions_size': 1, + 'total_count': 1}, + '3.6706,-17.84153': { 'suggestion': '33.670666,-117.841553', + 'suggestions': [ '33.670666,-117.841553'], + 'suggestions_size': 1, + 'total_count': 1}, + '37.789563,-12.40356': { 'suggestion': '37.789563,-122.400356', + 'suggestions': [ '37.789563,-122.400356'], + 'suggestions_size': 1, + 'total_count': 1}, + 'NAN': { 'suggestion': 'None', + 'suggestions': ['None'], + 'suggestions_size': 1, + 'total_count': 2}}, + 'names': { 'BANBL#ABÉ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], 'suggestions_size': 1, 'total_count': 1}, - '2880': { 'suggestion': -28.0, - 'suggestions': [-28.0], + 'BANBLABY': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MATRAPLAX': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex'], 'suggestions_size': 1, 'total_count': 1}, - 'anna': { 'suggestion': nan, - 'suggestions': [nan], - 'suggestions_size': 1, - 'total_count': 1}}, - 'last date seen': { '010410112041': { 'suggestion': '2011/04/10', - 'suggestions': ['2011/04/10'], - 'suggestions_size': 1, - 'total_count': 1}, - '010510122051': { 'suggestion': '2012/05/10', - 'suggestions': ['2012/05/10'], - 'suggestions_size': 1, - 'total_count': 1}, - '01061013203061': { 'suggestion': '2013/06/10', - 'suggestions': ['2013/06/10'], - 'suggestions_size': 1, - 'total_count': 1}, - '01071014204071': { 'suggestion': '2014/07/10', - 'suggestions': ['2014/07/10'], - 'suggestions_size': 1, - 'total_count': 1}, - '01081015205081': { 'suggestion': '2015/08/10', - 'suggestions': ['2015/08/10'], - 'suggestions_size': 1, - 'total_count': 1}, - '01091016206091': { 'suggestion': '2016/09/10', - 'suggestions': ['2016/09/10'], - 'suggestions_size': 1, - 'total_count': 1}}, - 'last position seen': { '000312222431353740566377788995': { 'suggestion': '37.789563,-122.400356', - 'suggestions': [ '37.789563,-122.400356'], - 'suggestions_size': 1, - 'total_count': 1}, - '01111920273542445973929499': { 'suggestion': '19.442735,-99.201111', - 'suggestions': [ '19.442735,-99.201111'], - 'suggestions_size': 1, - 'total_count': 1}, - '060710121625273442536164707177': { 'suggestion': '10.642707,-71.612534', - 'suggestions': [ '10.642707,-71.612534'], - 'suggestions_size': 1, - 'total_count': 1}, - '061115173336415355616667707884': { 'suggestion': '33.670666,-117.841553', - 'suggestions': [ '33.670666,-117.841553'], - 'suggestions_size': 1, - 'total_count': 1}, - 'nenoon': { 'suggestion': None, - 'suggestions': [None], - 'suggestions_size': 1, - 'total_count': 2}}, - 'names': { 'ateggameonrotr': { 'suggestion': 'Megatron', - 'suggestions': ['Megatron'], - 'suggestions_size': 1, - 'total_count': 1}, - 'azjazz': { 'suggestion': 'Jazz', - 'suggestions': ['Jazz'], - 'suggestions_size': 1, - 'total_count': 1}, - 'beblbuebeelembum': { 'suggestion': 'bumbl#ebéé ', - 'suggestions': ['bumbl#ebéé '], - 'suggestions_size': 1, - 'total_count': 1}, - 'dehiidirnhonro': { 'suggestion': 'ironhide&', - 'suggestions': ['ironhide&'], - 'suggestions_size': 1, - 'total_count': 1}, - 'etexlemeopplrotr': { 'suggestion': 'Metroplex_)^$', - 'suggestions': ['Metroplex_)^$'], - 'suggestions_size': 1, - 'total_count': 1}, - 'immuoppttius': { 'suggestion': 'Optimus', - 'suggestions': ['Optimus'], - 'suggestions_size': 1, - 'total_count': 1}}, - 'rank': { '': { 'suggestion': 7, - 'suggestions': [7, 8], - 'suggestions_size': 2, - 'total_count': 4}, - '10': { 'suggestion': 10, - 'suggestions': [10], + 'MATRAPLAX_)^$': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MÉTRAP´LA-X': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, + 'OPTAN': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}, + 'rank': { '10': { 'suggestion': '10', + 'suggestions': ['10'], 'suggestions_size': 1, - 'total_count': 2}}, - 'timestamp': { '01061420244062': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [ Timestamp('2014-06-24 00:00:00')], - 'suggestions_size': 1, - 'total_count': 6}}, - 'weight(t)': { '18': { 'suggestion': 1.8, - 'suggestions': [1.8], - 'suggestions_size': 1, - 'total_count': 1}, - '20': { 'suggestion': 2.0, - 'suggestions': [2.0], - 'suggestions_size': 1, - 'total_count': 1}, - '40': { 'suggestion': 4.0, - 'suggestions': [4.0], - 'suggestions_size': 1, - 'total_count': 1}, - '43': { 'suggestion': 4.3, - 'suggestions': [4.3], - 'suggestions_size': 1, - 'total_count': 1}, - '57': { 'suggestion': 5.7, - 'suggestions': [5.7], - 'suggestions_size': 1, - 'total_count': 1}, - 'anna': { 'suggestion': nan, - 'suggestions': [nan], - 'suggestions_size': 1, - 'total_count': 1}}} + 'total_count': 2}, + '7': { 'suggestion': '7', + 'suggestions': ['7'], + 'suggestions_size': 1, + 'total_count': 2}, + '8': { 'suggestion': '8', + 'suggestions': ['8'], + 'suggestions_size': 1, + 'total_count': 2}}, + 'timestamp': { '2014-06-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], + 'suggestions_size': 1, + 'total_count': 6}}, + 'weight(t)': { '1.8': { 'suggestion': '1.8', + 'suggestions': ['1.8'], + 'suggestions_size': 1, + 'total_count': 1}, + '2.0': { 'suggestion': '2.0', + 'suggestions': ['2.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '4.0': { 'suggestion': '4.0', + 'suggestions': ['4.0'], + 'suggestions_size': 1, + 'total_count': 1}, + '4.3': { 'suggestion': '4.3', + 'suggestions': ['4.3'], + 'suggestions_size': 1, + 'total_count': 1}, + '5.7': { 'suggestion': '5.7', + 'suggestions': ['5.7'], + 'suggestions_size': 1, + 'total_count': 1}, + 'NAN': { 'suggestion': 'nan', + 'suggestions': ['nan'], + 'suggestions_size': 1, + 'total_count': 1}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) - def test_string_clustering_all_nysiis(self): - df = self.df - result = df.string_clustering(cols='*', algorithm='nysiis') - # The following value does not represent a correct output of the operation - expected = self.dict - self.assertTrue(result.equals(expected, decimal=True, assertion=True)) - def test_string_clustering_all_soundex(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols='*', algorithm='soundex') - expected = { 'Cybertronian': { 'F420': { 'suggestion': False, - 'suggestions': [False], + expected = { 'Cybertronian': { 'F420': { 'suggestion': 'False', + 'suggestions': ['False'], 'suggestions_size': 1, 'total_count': 1}, - 'T600': { 'suggestion': True, - 'suggestions': [True], + 'T600': { 'suggestion': 'True', + 'suggestions': ['True'], 'suggestions_size': 1, 'total_count': 5}}, - 'Date Type': { '2000': { 'suggestion': Timestamp('2016-09-10 00:00:00'), - 'suggestions': [ Timestamp('2016-09-10 00:00:00'), - Timestamp('2015-08-10 00:00:00'), - Timestamp('2014-06-24 00:00:00'), - Timestamp('2013-06-24 00:00:00'), - Timestamp('2012-05-10 00:00:00'), - Timestamp('2011-04-10 00:00:00')], + 'Date Type': { '2000': { 'suggestion': '2016-09-10', + 'suggestions': [ '2016-09-10', + '2015-08-10', + '2014-06-24', + '2013-06-24', + '2012-05-10', + '2011-04-10'], 'suggestions_size': 6, 'total_count': 6}}, - 'NullType': { 'N500': { 'suggestion': None, - 'suggestions': [None], + 'NullType': { 'N500': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 6}}, - 'age': { '5000': { 'suggestion': 5000000, - 'suggestions': [5000000], + 'age': { '5000': { 'suggestion': '5000000', + 'suggestions': ['5000000'], 'suggestions_size': 1, 'total_count': 6}}, 'date arrival': { '1000': { 'suggestion': '1980/04/10', @@ -768,32 +843,32 @@ def test_string_clustering_all_soundex(self): 'suggestions': ['Leader'], 'suggestions_size': 1, 'total_count': 1}, - 'N500': { 'suggestion': None, - 'suggestions': [None], + 'N500': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 1}, 'S263': { 'suggestion': 'Security', 'suggestions': ['Security'], 'suggestions_size': 1, 'total_count': 1}}, - 'height(ft)': { '-000': { 'suggestion': -28.0, - 'suggestions': [-28.0], + 'height(ft)': { '-000': { 'suggestion': '-28.0', + 'suggestions': ['-28.0'], 'suggestions_size': 1, 'total_count': 1}, - '1000': { 'suggestion': 17.0, - 'suggestions': [17.0, 13.0], + '1000': { 'suggestion': '17.0', + 'suggestions': ['17.0', '13.0'], 'suggestions_size': 2, 'total_count': 2}, - '2000': { 'suggestion': 26.0, - 'suggestions': [26.0], + '2000': { 'suggestion': '26.0', + 'suggestions': ['26.0'], 'suggestions_size': 1, 'total_count': 1}, - '3000': { 'suggestion': 300.0, - 'suggestions': [300.0], + '3000': { 'suggestion': '300.0', + 'suggestions': ['300.0'], 'suggestions_size': 1, 'total_count': 1}, - 'N500': { 'suggestion': nan, - 'suggestions': [nan], + 'N500': { 'suggestion': 'nan', + 'suggestions': ['nan'], 'suggestions_size': 1, 'total_count': 1}}, 'last date seen': { '2000': { 'suggestion': '2016/09/10', @@ -815,526 +890,386 @@ def test_string_clustering_all_soundex(self): '33.670666,-117.841553'], 'suggestions_size': 2, 'total_count': 2}, - 'N500': { 'suggestion': None, - 'suggestions': [None], + 'N500': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 2}}, 'names': { 'B514': { 'suggestion': 'bumbl#ebéé ', - 'suggestions': ['bumbl#ebéé '], - 'suggestions_size': 1, - 'total_count': 1}, - 'I653': { 'suggestion': 'ironhide&', - 'suggestions': ['ironhide&'], - 'suggestions_size': 1, - 'total_count': 1}, - 'J200': { 'suggestion': 'Jazz', - 'suggestions': ['Jazz'], - 'suggestions_size': 1, - 'total_count': 1}, - 'M236': { 'suggestion': 'Megatron', - 'suggestions': ['Megatron'], - 'suggestions_size': 1, - 'total_count': 1}, - 'M361': { 'suggestion': 'Metroplex_)^$', - 'suggestions': ['Metroplex_)^$'], - 'suggestions_size': 1, - 'total_count': 1}, + 'suggestions': ['bumbl#ebéé ', 'bumblebee'], + 'suggestions_size': 2, + 'total_count': 2}, + 'M361': { 'suggestion': 'Metroplex', + 'suggestions': [ 'Metroplex', + 'métrop´le-x', + 'Metroplex_)^$'], + 'suggestions_size': 3, + 'total_count': 3}, 'O135': { 'suggestion': 'Optimus', 'suggestions': ['Optimus'], 'suggestions_size': 1, 'total_count': 1}}, - 'rank': { '1000': { 'suggestion': 10, - 'suggestions': [10], + 'rank': { '1000': { 'suggestion': '10', + 'suggestions': ['10'], 'suggestions_size': 1, 'total_count': 2}, - '7000': { 'suggestion': 7, - 'suggestions': [7], + '7000': { 'suggestion': '7', + 'suggestions': ['7'], 'suggestions_size': 1, 'total_count': 2}, - '8000': { 'suggestion': 8, - 'suggestions': [8], + '8000': { 'suggestion': '8', + 'suggestions': ['8'], 'suggestions_size': 1, 'total_count': 2}}, - 'timestamp': { '2000': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'timestamp': { '2000': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], 'suggestions_size': 1, 'total_count': 6}}, - 'weight(t)': { '1000': { 'suggestion': 1.8, - 'suggestions': [1.8], + 'weight(t)': { '1000': { 'suggestion': '1.8', + 'suggestions': ['1.8'], 'suggestions_size': 1, 'total_count': 1}, - '2000': { 'suggestion': 2.0, - 'suggestions': [2.0], + '2000': { 'suggestion': '2.0', + 'suggestions': ['2.0'], 'suggestions_size': 1, 'total_count': 1}, - '4000': { 'suggestion': 4.3, - 'suggestions': [4.3, 4.0], + '4000': { 'suggestion': '4.3', + 'suggestions': ['4.3', '4.0'], 'suggestions_size': 2, 'total_count': 2}, - '5000': { 'suggestion': 5.7, - 'suggestions': [5.7], + '5000': { 'suggestion': '5.7', + 'suggestions': ['5.7'], 'suggestions_size': 1, 'total_count': 1}, - 'N500': { 'suggestion': nan, - 'suggestions': [nan], + 'N500': { 'suggestion': 'nan', + 'suggestions': ['nan'], 'suggestions_size': 1, 'total_count': 1}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) def test_string_clustering_multiple_double_metaphone(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='double_metaphone') - expected = { 'Cybertronian': { ('FLS', ''): { 'suggestion': False, - 'suggestions': [False], - 'suggestions_size': 1, - 'total_count': 1}, - ('TR', ''): { 'suggestion': True, - 'suggestions': [True], - 'suggestions_size': 1, - 'total_count': 5}}, - 'NullType': { ('NN', ''): { 'suggestion': None, - 'suggestions': [None], - 'suggestions_size': 1, - 'total_count': 6}}, - 'timestamp': { ('', ''): { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [Timestamp('2014-06-24 00:00:00')], - 'suggestions_size': 1, - 'total_count': 6}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) def test_string_clustering_multiple_fingerprint(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='fingerprint') - expected = { 'Cybertronian': { 'false': { 'suggestion': False, - 'suggestions': [False], - 'suggestions_size': 1, - 'total_count': 1}, - 'true': { 'suggestion': True, - 'suggestions': [True], - 'suggestions_size': 1, - 'total_count': 5}}, - 'NullType': { 'none': { 'suggestion': None, - 'suggestions': [None], - 'suggestions_size': 1, - 'total_count': 6}}, - 'timestamp': { '20140624': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [ Timestamp('2014-06-24 00:00:00')], - 'suggestions_size': 1, - 'total_count': 6}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) def test_string_clustering_multiple_levenshtein(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='levenshtein') - expected = { 'timestamp': { '20140624': { 'suggestion': '20140624', - 'suggestions': ['20140624'], - 'suggestions_size': 1, - 'total_count': 6}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) def test_string_clustering_multiple_match_rating_codex(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='match_rating_codex') - expected = { 'Cybertronian': { 'FLS': { 'suggestion': False, - 'suggestions': [False], + expected = { 'Cybertronian': { 'FLS': { 'suggestion': 'False', + 'suggestions': ['False'], 'suggestions_size': 1, 'total_count': 1}, - 'TR': { 'suggestion': True, - 'suggestions': [True], + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], 'suggestions_size': 1, 'total_count': 5}}, - 'NullType': { 'N': { 'suggestion': None, - 'suggestions': [None], + 'NullType': { 'N': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 6}}, - 'timestamp': { '201-24': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'timestamp': { '201-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], 'suggestions_size': 1, 'total_count': 6}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) def test_string_clustering_multiple_metaphone(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='metaphone') - expected = { 'Cybertronian': { 'FLS': { 'suggestion': False, - 'suggestions': [False], + expected = { 'Cybertronian': { 'FLS': { 'suggestion': 'False', + 'suggestions': ['False'], 'suggestions_size': 1, 'total_count': 1}, - 'TR': { 'suggestion': True, - 'suggestions': [True], + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], 'suggestions_size': 1, 'total_count': 5}}, - 'NullType': { 'NN': { 'suggestion': None, - 'suggestions': [None], + 'NullType': { 'NN': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 6}}, - 'timestamp': { '': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'timestamp': { '': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], 'suggestions_size': 1, 'total_count': 6}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) def test_string_clustering_multiple_ngram_fingerprint(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='ngram_fingerprint') - expected = { 'Cybertronian': { 'alfalsse': { 'suggestion': False, - 'suggestions': [False], - 'suggestions_size': 1, - 'total_count': 1}, - 'rutrue': { 'suggestion': True, - 'suggestions': [True], - 'suggestions_size': 1, - 'total_count': 5}}, - 'NullType': { 'nenoon': { 'suggestion': None, - 'suggestions': [None], - 'suggestions_size': 1, - 'total_count': 6}}, - 'timestamp': { '01061420244062': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [ Timestamp('2014-06-24 00:00:00')], - 'suggestions_size': 1, - 'total_count': 6}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) def test_string_clustering_multiple_nysiis(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='nysiis') - expected = { 'Cybertronian': { 'FALS': { 'suggestion': False, - 'suggestions': [False], + expected = { 'Cybertronian': { 'FALS': { 'suggestion': 'False', + 'suggestions': ['False'], 'suggestions_size': 1, 'total_count': 1}, - 'TR': { 'suggestion': True, - 'suggestions': [True], + 'TR': { 'suggestion': 'True', + 'suggestions': ['True'], 'suggestions_size': 1, 'total_count': 5}}, - 'NullType': { 'NAN': { 'suggestion': None, - 'suggestions': [None], + 'NullType': { 'NAN': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 6}}, - 'timestamp': { '2014-06-24': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [ Timestamp('2014-06-24 00:00:00')], + 'timestamp': { '2014-06-24': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], 'suggestions_size': 1, 'total_count': 6}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) def test_string_clustering_multiple_soundex(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['NullType', 'Cybertronian', 'timestamp'], algorithm='soundex') - expected = { 'Cybertronian': { 'F420': { 'suggestion': False, - 'suggestions': [False], + expected = { 'Cybertronian': { 'F420': { 'suggestion': 'False', + 'suggestions': ['False'], 'suggestions_size': 1, 'total_count': 1}, - 'T600': { 'suggestion': True, - 'suggestions': [True], + 'T600': { 'suggestion': 'True', + 'suggestions': ['True'], 'suggestions_size': 1, 'total_count': 5}}, - 'NullType': { 'N500': { 'suggestion': None, - 'suggestions': [None], + 'NullType': { 'N500': { 'suggestion': 'None', + 'suggestions': ['None'], 'suggestions_size': 1, 'total_count': 6}}, - 'timestamp': { '2000': { 'suggestion': Timestamp('2014-06-24 00:00:00'), - 'suggestions': [Timestamp('2014-06-24 00:00:00')], + 'timestamp': { '2000': { 'suggestion': '2014-06-24', + 'suggestions': ['2014-06-24'], 'suggestions_size': 1, 'total_count': 6}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) def test_string_clustering_numeric_double_metaphone(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['rank'], algorithm='double_metaphone') - expected = { 'rank': { ('', ''): { 'suggestion': 10, - 'suggestions': [10, 7, 8], - 'suggestions_size': 3, - 'total_count': 6}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) def test_string_clustering_numeric_fingerprint(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['rank'], algorithm='fingerprint') - expected = { 'rank': { '10': { 'suggestion': 10, - 'suggestions': [10], - 'suggestions_size': 1, - 'total_count': 2}, - '7': { 'suggestion': 7, - 'suggestions': [7], - 'suggestions_size': 1, - 'total_count': 2}, - '8': { 'suggestion': 8, - 'suggestions': [8], - 'suggestions_size': 1, - 'total_count': 2}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) def test_string_clustering_numeric_levenshtein(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['rank'], algorithm='levenshtein') + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + + def test_string_clustering_numeric_match_rating_codex(self): + df = self.df.copy() + result = df.string_clustering(cols=['rank'], algorithm='match_rating_codex') expected = { 'rank': { '10': { 'suggestion': '10', 'suggestions': ['10'], 'suggestions_size': 1, - 'total_count': 6}, + 'total_count': 2}, '7': { 'suggestion': '7', 'suggestions': ['7'], 'suggestions_size': 1, - 'total_count': 6}, + 'total_count': 2}, '8': { 'suggestion': '8', 'suggestions': ['8'], 'suggestions_size': 1, - 'total_count': 6}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) - - def test_string_clustering_numeric_match_rating_codex(self): - df = self.df - result = df.string_clustering(cols=['rank'], algorithm='match_rating_codex') - expected = { 'rank': { '10': { 'suggestion': 10, - 'suggestions': [10], - 'suggestions_size': 1, - 'total_count': 2}, - '7': { 'suggestion': 7, - 'suggestions': [7], - 'suggestions_size': 1, - 'total_count': 2}, - '8': { 'suggestion': 8, - 'suggestions': [8], - 'suggestions_size': 1, 'total_count': 2}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) def test_string_clustering_numeric_metaphone(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['rank'], algorithm='metaphone') - expected = { 'rank': { '': { 'suggestion': 10, - 'suggestions': [10, 7, 8], + expected = { 'rank': { '': { 'suggestion': '10', + 'suggestions': ['10', '7', '8'], 'suggestions_size': 3, 'total_count': 6}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) def test_string_clustering_numeric_ngram_fingerprint(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['rank'], algorithm='ngram_fingerprint') - expected = { 'rank': { '': { 'suggestion': 7, - 'suggestions': [7, 8], - 'suggestions_size': 2, - 'total_count': 4}, - '10': { 'suggestion': 10, - 'suggestions': [10], - 'suggestions_size': 1, - 'total_count': 2}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) def test_string_clustering_numeric_nysiis(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['rank'], algorithm='nysiis') - expected = { 'rank': { '10': { 'suggestion': 10, - 'suggestions': [10], + expected = { 'rank': { '10': { 'suggestion': '10', + 'suggestions': ['10'], 'suggestions_size': 1, 'total_count': 2}, - '7': { 'suggestion': 7, - 'suggestions': [7], + '7': { 'suggestion': '7', + 'suggestions': ['7'], 'suggestions_size': 1, 'total_count': 2}, - '8': { 'suggestion': 8, - 'suggestions': [8], + '8': { 'suggestion': '8', + 'suggestions': ['8'], 'suggestions_size': 1, 'total_count': 2}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) def test_string_clustering_numeric_soundex(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['rank'], algorithm='soundex') - expected = { 'rank': { '1000': { 'suggestion': 10, - 'suggestions': [10], + expected = { 'rank': { '1000': { 'suggestion': '10', + 'suggestions': ['10'], 'suggestions_size': 1, 'total_count': 2}, - '7000': { 'suggestion': 7, - 'suggestions': [7], + '7000': { 'suggestion': '7', + 'suggestions': ['7'], 'suggestions_size': 1, 'total_count': 2}, - '8000': { 'suggestion': 8, - 'suggestions': [8], + '8000': { 'suggestion': '8', + 'suggestions': ['8'], 'suggestions_size': 1, 'total_count': 2}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) def test_string_clustering_string_double_metaphone(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['names'], algorithm='double_metaphone') - expected = { 'names': { ('APTMS', ''): { 'suggestion': 'Optimus', - 'suggestions': ['Optimus'], - 'suggestions_size': 1, - 'total_count': 1}, - ('ARNT', ''): { 'suggestion': 'ironhide&', - 'suggestions': ['ironhide&'], - 'suggestions_size': 1, - 'total_count': 1}, - ('JS', 'AS'): { 'suggestion': 'Jazz', - 'suggestions': ['Jazz'], - 'suggestions_size': 1, - 'total_count': 1}, - ('MKTRN', ''): { 'suggestion': 'Megatron', - 'suggestions': ['Megatron'], - 'suggestions_size': 1, - 'total_count': 1}, - ('MTRPLKSKSKSKSKS', ''): { 'suggestion': 'Metroplex_)^$', - 'suggestions': ['Metroplex_)^$'], - 'suggestions_size': 1, - 'total_count': 1}, - ('PMPLLP', ''): { 'suggestion': 'bumbl#ebéé ', - 'suggestions': ['bumbl#ebéé '], - 'suggestions_size': 1, - 'total_count': 1}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) def test_string_clustering_string_fingerprint(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['names'], algorithm='fingerprint') - expected = { 'names': { 'bumblebee': { 'suggestion': 'bumbl#ebéé ', - 'suggestions': ['bumbl#ebéé '], - 'suggestions_size': 1, - 'total_count': 1}, - 'ironhide': { 'suggestion': 'ironhide&', - 'suggestions': ['ironhide&'], - 'suggestions_size': 1, - 'total_count': 1}, - 'jazz': { 'suggestion': 'Jazz', - 'suggestions': ['Jazz'], - 'suggestions_size': 1, - 'total_count': 1}, - 'megatron': { 'suggestion': 'Megatron', - 'suggestions': ['Megatron'], - 'suggestions_size': 1, - 'total_count': 1}, - 'metroplex': { 'suggestion': 'Metroplex_)^$', - 'suggestions': ['Metroplex_)^$'], - 'suggestions_size': 1, - 'total_count': 1}, - 'optimus': { 'suggestion': 'Optimus', - 'suggestions': ['Optimus'], - 'suggestions_size': 1, - 'total_count': 1}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + # The following value does not represent a correct output of the operation + expected = self.dict + self.assertTrue(result.equals(expected, decimal=True, assertion=True)) def test_string_clustering_string_levenshtein(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['names'], algorithm='levenshtein') - expected = { 'names': { 'bumblebee': { 'suggestion': 'bumblebee', - 'suggestions': ['bumblebee', 'ironhide'], - 'suggestions_size': 2, - 'total_count': 6}, - 'ironhide': { 'suggestion': 'ironhide', - 'suggestions': ['ironhide', 'optimus'], - 'suggestions_size': 2, - 'total_count': 6}, - 'jazz': { 'suggestion': 'jazz', - 'suggestions': ['jazz', 'optimus'], - 'suggestions_size': 2, - 'total_count': 6}, - 'megatron': { 'suggestion': 'megatron', - 'suggestions': ['megatron', 'metroplex'], - 'suggestions_size': 2, - 'total_count': 6}, - 'metroplex': { 'suggestion': 'metroplex', - 'suggestions': ['metroplex', 'megatron'], - 'suggestions_size': 2, - 'total_count': 6}, - 'optimus': { 'suggestion': 'optimus', - 'suggestions': ['optimus', 'ironhide'], - 'suggestions_size': 2, - 'total_count': 6}}} - self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) - - def test_string_clustering_string_match_rating_codex(self): - df = self.df - result = df.string_clustering(cols=['names'], algorithm='match_rating_codex') # The following value does not represent a correct output of the operation expected = self.dict self.assertTrue(result.equals(expected, decimal=True, assertion=True)) - def test_string_clustering_string_metaphone(self): - df = self.df - result = df.string_clustering(cols=['names'], algorithm='metaphone') - expected = { 'names': { 'BMBLB ': { 'suggestion': 'bumbl#ebéé ', + def test_string_clustering_string_match_rating_codex(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='match_rating_codex') + expected = { 'names': { 'BMB#BÉ': { 'suggestion': 'bumbl#ebéé ', 'suggestions': ['bumbl#ebéé '], 'suggestions_size': 1, 'total_count': 1}, - 'IRNHT': { 'suggestion': 'ironhide&', - 'suggestions': ['ironhide&'], + 'BMBLB': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], 'suggestions_size': 1, 'total_count': 1}, - 'JS': { 'suggestion': 'Jazz', - 'suggestions': ['Jazz'], - 'suggestions_size': 1, - 'total_count': 1}, - 'MKTRN': { 'suggestion': 'Megatron', - 'suggestions': ['Megatron'], - 'suggestions_size': 1, - 'total_count': 1}, - 'MTRPLKS': { 'suggestion': 'Metroplex_)^$', - 'suggestions': ['Metroplex_)^$'], - 'suggestions_size': 1, - 'total_count': 1}, + 'MTR)^$': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRPLX': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MÉTL-X': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, 'OPTMS': { 'suggestion': 'Optimus', 'suggestions': ['Optimus'], 'suggestions_size': 1, 'total_count': 1}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) - def test_string_clustering_string_ngram_fingerprint(self): - df = self.df - result = df.string_clustering(cols=['names'], algorithm='ngram_fingerprint') - expected = { 'names': { 'ateggameonrotr': { 'suggestion': 'Megatron', - 'suggestions': ['Megatron'], - 'suggestions_size': 1, - 'total_count': 1}, - 'azjazz': { 'suggestion': 'Jazz', - 'suggestions': ['Jazz'], + def test_string_clustering_string_metaphone(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='metaphone') + expected = { 'names': { 'BMBLB': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'BMBLB ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], 'suggestions_size': 1, 'total_count': 1}, - 'beblbuebeelembum': { 'suggestion': 'bumbl#ebéé ', - 'suggestions': ['bumbl#ebéé '], - 'suggestions_size': 1, - 'total_count': 1}, - 'dehiidirnhonro': { 'suggestion': 'ironhide&', - 'suggestions': ['ironhide&'], - 'suggestions_size': 1, - 'total_count': 1}, - 'etexlemeopplrotr': { 'suggestion': 'Metroplex_)^$', - 'suggestions': ['Metroplex_)^$'], - 'suggestions_size': 1, - 'total_count': 1}, - 'immuoppttius': { 'suggestion': 'Optimus', - 'suggestions': ['Optimus'], - 'suggestions_size': 1, - 'total_count': 1}}} + 'MTRP LKS': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MTRPLKS': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex', 'Metroplex_)^$'], + 'suggestions_size': 2, + 'total_count': 2}, + 'OPTMS': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}} self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) - def test_string_clustering_string_nysiis(self): - df = self.df - result = df.string_clustering(cols=['names'], algorithm='nysiis') + def test_string_clustering_string_ngram_fingerprint(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='ngram_fingerprint') # The following value does not represent a correct output of the operation expected = self.dict self.assertTrue(result.equals(expected, decimal=True, assertion=True)) + def test_string_clustering_string_nysiis(self): + df = self.df.copy() + result = df.string_clustering(cols=['names'], algorithm='nysiis') + expected = { 'names': { 'BANBL#ABÉ': { 'suggestion': 'bumbl#ebéé ', + 'suggestions': ['bumbl#ebéé '], + 'suggestions_size': 1, + 'total_count': 1}, + 'BANBLABY': { 'suggestion': 'bumblebee', + 'suggestions': ['bumblebee'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MATRAPLAX': { 'suggestion': 'Metroplex', + 'suggestions': ['Metroplex'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MATRAPLAX_)^$': { 'suggestion': 'Metroplex_)^$', + 'suggestions': ['Metroplex_)^$'], + 'suggestions_size': 1, + 'total_count': 1}, + 'MÉTRAP´LA-X': { 'suggestion': 'métrop´le-x', + 'suggestions': ['métrop´le-x'], + 'suggestions_size': 1, + 'total_count': 1}, + 'OPTAN': { 'suggestion': 'Optimus', + 'suggestions': ['Optimus'], + 'suggestions_size': 1, + 'total_count': 1}}} + self.assertTrue(results_equal(result, expected, decimal=5, assertion=True)) + def test_string_clustering_string_soundex(self): - df = self.df + df = self.df.copy() result = df.string_clustering(cols=['names'], algorithm='soundex') expected = { 'names': { 'B514': { 'suggestion': 'bumbl#ebéé ', - 'suggestions': ['bumbl#ebéé '], - 'suggestions_size': 1, - 'total_count': 1}, - 'I653': { 'suggestion': 'ironhide&', - 'suggestions': ['ironhide&'], - 'suggestions_size': 1, - 'total_count': 1}, - 'J200': { 'suggestion': 'Jazz', - 'suggestions': ['Jazz'], - 'suggestions_size': 1, - 'total_count': 1}, - 'M236': { 'suggestion': 'Megatron', - 'suggestions': ['Megatron'], - 'suggestions_size': 1, - 'total_count': 1}, - 'M361': { 'suggestion': 'Metroplex_)^$', - 'suggestions': ['Metroplex_)^$'], - 'suggestions_size': 1, - 'total_count': 1}, + 'suggestions': ['bumbl#ebéé ', 'bumblebee'], + 'suggestions_size': 2, + 'total_count': 2}, + 'M361': { 'suggestion': 'Metroplex', + 'suggestions': [ 'Metroplex', + 'métrop´le-x', + 'Metroplex_)^$'], + 'suggestions_size': 3, + 'total_count': 3}, 'O135': { 'suggestion': 'Optimus', 'suggestions': ['Optimus'], 'suggestions_size': 1,