From c2490d1ed5e8aa4ca360894865e7c923c30fed98 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 14 Feb 2018 23:16:59 +0100 Subject: [PATCH 01/15] more cleanup BP/CS/TSA/CIDEX/CEDEX --- addok_france/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 779ab89..c3b01eb 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -51,8 +51,9 @@ def clean_query(q): q = re.sub(r'([\d]{5})', r' \1 ', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n *|no *|)[\d]* ?', '', q, flags=re.IGNORECASE) + q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) - q = re.sub(r'\b(bp|cs|tsa|cidex) *[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) From 253cb541fdd32defb4070ce496ecaf15d42ebf74 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 14 Feb 2018 23:35:42 +0100 Subject: [PATCH 02/15] =?UTF-8?q?cleanup=20BP/CS/TSA/CIDEX=20N=C2=B0=20+?= =?UTF-8?q?=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- addok_france/utils.py | 5 ++++- tests/test_utils.py | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index c3b01eb..093fbce 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -51,8 +51,11 @@ def clean_query(q): q = re.sub(r'([\d]{5})', r' \1 ', q, flags=re.IGNORECASE) - q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n *|no *|)[\d]* ?', '', q, flags=re.IGNORECASE) + print(q) + q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) + print(q) q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) + print(q) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE) diff --git a/tests/test_utils.py b/tests/test_utils.py index 969ee91..9ea13cc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -13,13 +13,17 @@ @pytest.mark.parametrize("input,expected", [ ("2 allée Jules Guesde 31068 TOULOUSE CEDEX 7", - "2 allée Jules Guesde 31068 TOULOUSE"), + "2 allée Jules Guesde 31 TOULOUSE"), ("7, avenue Léon-Blum 31507 Toulouse Cedex 5", - "7, avenue Léon-Blum 31507 Toulouse"), + "7, avenue Léon-Blum 31 Toulouse"), ("159, avenue Jacques-Douzans 31604 Muret Cedex", - "159, avenue Jacques-Douzans 31604 Muret"), + "159, avenue Jacques-Douzans 31 Muret"), ("2 allée Jules Guesde BP 7015 31068 TOULOUSE", "2 allée Jules Guesde 31068 TOULOUSE"), + ("2 allée Jules Guesde B.P. 7015 31068 TOULOUSE", + "2 allée Jules Guesde 31068 TOULOUSE"), + ("2 allée Jules Guesde B.P. N 7015 31068 TOULOUSE", + "2 allée Jules Guesde 31068 TOULOUSE"), ("BP 80111 159, avenue Jacques-Douzans 31604 Muret", "159, avenue Jacques-Douzans 31604 Muret"), ("12, place de l'Hôtel-de-Ville BP 46 02150 Sissonne", @@ -27,7 +31,7 @@ ("6, rue Winston-Churchill CS 40055 60321 Compiègne", "6, rue Winston-Churchill 60321 Compiègne"), ("BP 80111 159, avenue Jacques-Douzans 31604 Muret Cedex", - "159, avenue Jacques-Douzans 31604 Muret"), + "159, avenue Jacques-Douzans 31 Muret"), ("BP 20169 Cite administrative - 8e étage Rue Gustave-Delory 59017 Lille", "Cite administrative - Rue Gustave-Delory 59017 Lille"), ("12e étage Rue Gustave-Delory 59017 Lille", @@ -52,9 +56,15 @@ ("32bis Rue des Vosges93290", "32bis Rue des Vosges 93290"), ("20 avenue de Ségur TSA 30719 75334 Paris Cedex 07", - "20 avenue de Ségur 75334 Paris"), + "20 avenue de Ségur 75 Paris"), + ("20 avenue de Ségur TSA No30719 75334 Paris Cedex 07", + "20 avenue de Ségur 75 Paris"), + ("20 avenue de Ségur TSA N 30719 75334 Paris Cedex 07", + "20 avenue de Ségur 75 Paris"), ("20 rue saint germain CIDEX 304 89110 Poilly-sur-tholon", "20 rue saint germain 89110 Poilly-sur-tholon"), + ("20 rue saint germain CIDEX N°304 89110 Poilly-sur-tholon", + "20 rue saint germain 89110 Poilly-sur-tholon"), ]) def test_clean_query(input, expected): assert clean_query(input) == expected From 3f75ccfc8c15d27a70e602f60ef29d03fa154fa2 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 14 Feb 2018 23:36:22 +0100 Subject: [PATCH 03/15] do not break on queries like "12bis" --- addok_france/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 093fbce..28efe54 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -120,14 +120,17 @@ def flag_housenumber(tokens): def fold_ordinal(s): """3bis => 3b.""" - if s[0].isdigit() and not s.isdigit(): + if s is not None and s !='' and s[0].isdigit() and not s.isdigit(): try: number, ordinal = FOLD_PATTERN.findall(s)[0] except (IndexError, ValueError): pass else: - s = s.update('{}{}'.format(number, + try: + s = s.update('{}{}'.format(number, FOLD.get(ordinal.lower(), ordinal))) + except: + pass return s From 6d59df33d5dfb4b7fc09da4f4c2abf2844c46701 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 14 Feb 2018 23:38:36 +0100 Subject: [PATCH 04/15] print() removed --- addok_france/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 28efe54..265cc01 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -51,11 +51,8 @@ def clean_query(q): q = re.sub(r'([\d]{5})', r' \1 ', q, flags=re.IGNORECASE) - print(q) q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) - print(q) q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) - print(q) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE) From 4e5be21774d2f9b437119ff15160f7652a9be5d0 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 14 Feb 2018 23:58:20 +0100 Subject: [PATCH 05/15] cleanup phone/fax numbers --- addok_france/utils.py | 3 ++- tests/test_utils.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 265cc01..205594a 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -50,11 +50,12 @@ def clean_query(q): - q = re.sub(r'([\d]{5})', r' \1 ', q, flags=re.IGNORECASE) q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) + q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ', q, flags=re.IGNORECASE) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) + q = re.sub(r'((fax|t[eé]l|t[eé]l[eé]copieur)[ :,\.]*|)(\d{10}|[0-9][0-9][ -\./]\d\d[-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d)', '', q, flags=re.IGNORECASE) q = re.sub(' {2,}', ' ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9ea13cc..5e4ce00 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -65,6 +65,16 @@ "20 rue saint germain 89110 Poilly-sur-tholon"), ("20 rue saint germain CIDEX N°304 89110 Poilly-sur-tholon", "20 rue saint germain 89110 Poilly-sur-tholon"), + ("20 rue saint germain 89110 Poilly-sur-tholon 01.23.45.67.89", + "20 rue saint germain 89110 Poilly-sur-tholon"), + ("32bis Rue des Vosges93290 fax: 0123456789", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 tel 01 23 45 67 89", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 telecopieur. 01/23/45/67/89", + "32bis Rue des Vosges 93290"), + ("32bis Rue des Vosges 93290 télécopieur, 01-23-45-67-89", + "32bis Rue des Vosges 93290"), ]) def test_clean_query(input, expected): assert clean_query(input) == expected From 454ca5006542d430677727e4474297a23e63fba2 Mon Sep 17 00:00:00 2001 From: cquest Date: Thu, 15 Feb 2018 00:27:54 +0100 Subject: [PATCH 06/15] fold initiales: F F I > F F I FFI, etc --- addok_france/utils.py | 1 + tests/test_utils.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/addok_france/utils.py b/addok_france/utils.py index 205594a..28100b8 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -60,6 +60,7 @@ def clean_query(q): q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE) q = re.sub('^lieux?[ -]?dits?\\b(?=.)', '', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(([A-Z]) ([A-Z]) (([A-Z]) )?(([A-Z]) )?(([A-Z])( |$))?)', r'\1\2\3\4\6\8\10 ', q, flags=re.IGNORECASE) q = q.strip() return q diff --git a/tests/test_utils.py b/tests/test_utils.py index 5e4ce00..e5f128d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -75,6 +75,8 @@ "32bis Rue des Vosges 93290"), ("32bis Rue des Vosges 93290 télécopieur, 01-23-45-67-89", "32bis Rue des Vosges 93290"), + ("10 BLD DES F F I 85300 CHALLANS", + "10 BLD DES F F I FFI 85300 CHALLANS"), ]) def test_clean_query(input, expected): assert clean_query(input) == expected From 045c8f03a734ef823b41c399a411e6354b2049c5 Mon Sep 17 00:00:00 2001 From: cquest Date: Thu, 15 Feb 2018 01:02:29 +0100 Subject: [PATCH 07/15] boite postale --- addok_france/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 28100b8..f1640d2 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -50,7 +50,7 @@ def clean_query(q): - q = re.sub(r'(^| )(b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(boite postale|b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ', q, flags=re.IGNORECASE) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) From 2261c92108b33e999779ae6eca56ed851aa17b36 Mon Sep 17 00:00:00 2001 From: Christian Quest Date: Fri, 16 Feb 2018 18:56:47 +0100 Subject: [PATCH 08/15] WIP fold_initials "F F I" > "FFI" --- addok_france/__init__.py | 1 + addok_france/utils.py | 15 +++++++++++++++ tests/test_utils.py | 30 +++++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 2b20ccb..8f48d8c 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,6 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) +fold_initials = yielder(utils.fold_initials) flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) diff --git a/addok_france/utils.py b/addok_france/utils.py index f1640d2..4b4f94a 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -133,6 +133,21 @@ def fold_ordinal(s): return s +def fold_initials(tokens): + initials = [] + for _, token, next_ in neighborhood(tokens): + if len(token)==1: + initials.append(token) + else: + if len(initial)>2: + initials[0].update("".join(initials)) + yield initials[0] + else: + for tk in initials: + yield tk + yield token + + def remove_leading_zeros(s): """0003 => 3.""" # Limit digits from 1 to 3 in order to avoid processing postcodes. diff --git a/tests/test_utils.py b/tests/test_utils.py index e5f128d..445ae1f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,7 +8,7 @@ from addok.helpers.text import Token from addok_france.utils import (clean_query, extract_address, flag_housenumber, fold_ordinal, glue_ordinal, make_labels, - remove_leading_zeros) + remove_leading_zeros, fold_initials) @pytest.mark.parametrize("input,expected", [ @@ -353,3 +353,31 @@ def test_make_municipality_labels(config): '59000 Lille', 'Lille 59000', ] + + +@pytest.mark.parametrize("inputs,expected", [ + (['6', 'bis'], ['6bis']), + (['6'], ['6']), + (['6', 'avenue'], ['6', 'avenue']), + (['60', 'bis', 'avenue'], ['60bis', 'avenue']), + (['600', 'ter', 'avenue'], ['600ter', 'avenue']), + (['6', 'quinquies', 'avenue'], ['6quinquies', 'avenue']), + (['60', 'sexies', 'avenue'], ['60sexies', 'avenue']), + (['600', 'quater', 'avenue'], ['600quater', 'avenue']), + (['6', 's', 'avenue'], ['6s', 'avenue']), + (['60b', 'avenue'], ['60b', 'avenue']), + (['600', 'b', 'avenue'], ['600b', 'avenue']), + (['241', 'r', 'de'], ['241', 'r', 'de']), + (['120', 'r', 'renard'], ['120', 'r', 'renard']), + (['241', 'r', 'rue'], ['241r', 'rue']), + (['place', 'des', 'terreaux'], ['place', 'des', 'terreaux']), + (['rue', 'du', 'bis'], ['rue', 'du', 'bis']), +]) +@pytest.mark.parametrize("input,expected", [ + (['allee','a','b','c'], ['allee','abc']), + (['allee','a','b','c','toto'], ['allee','abc','toto']), + (['allee','a','b','c','d'], ['allee','abcd']), + (['allee','a','b','c','d','e'], ['allee','abcde']), +]) +def test_fold_initials(input, expected): + assert fold_initials(Token(input)) == expected From 5f11ac443104c8c90fe3c0825a5ffa8ad2409875 Mon Sep 17 00:00:00 2001 From: Christian Quest Date: Fri, 16 Feb 2018 19:48:27 +0100 Subject: [PATCH 09/15] fold_initials + tests --- addok_france/utils.py | 15 +++++++++------ tests/test_utils.py | 27 ++++++--------------------- 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 4b4f94a..6697a78 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -127,24 +127,27 @@ def fold_ordinal(s): else: try: s = s.update('{}{}'.format(number, - FOLD.get(ordinal.lower(), ordinal))) + FOLD.get(ordinal.lower(), ordinal))) except: pass return s def fold_initials(tokens): + """ folds 'F F I' into 'FFI' """ initials = [] for _, token, next_ in neighborhood(tokens): - if len(token)==1: + isinitial = len(token) == 1 and token.isalpha() + if isinitial: initials.append(token) - else: - if len(initial)>2: - initials[0].update("".join(initials)) - yield initials[0] + if not next_ or not isinitial: + if len(initials) > 2: + yield initials[0].update("".join(initials)) else: for tk in initials: yield tk + initials = [] + if not isinitial: yield token diff --git a/tests/test_utils.py b/tests/test_utils.py index 445ae1f..c08f55b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -356,28 +356,13 @@ def test_make_municipality_labels(config): @pytest.mark.parametrize("inputs,expected", [ - (['6', 'bis'], ['6bis']), - (['6'], ['6']), - (['6', 'avenue'], ['6', 'avenue']), - (['60', 'bis', 'avenue'], ['60bis', 'avenue']), - (['600', 'ter', 'avenue'], ['600ter', 'avenue']), - (['6', 'quinquies', 'avenue'], ['6quinquies', 'avenue']), - (['60', 'sexies', 'avenue'], ['60sexies', 'avenue']), - (['600', 'quater', 'avenue'], ['600quater', 'avenue']), - (['6', 's', 'avenue'], ['6s', 'avenue']), - (['60b', 'avenue'], ['60b', 'avenue']), - (['600', 'b', 'avenue'], ['600b', 'avenue']), - (['241', 'r', 'de'], ['241', 'r', 'de']), - (['120', 'r', 'renard'], ['120', 'r', 'renard']), - (['241', 'r', 'rue'], ['241r', 'rue']), - (['place', 'des', 'terreaux'], ['place', 'des', 'terreaux']), - (['rue', 'du', 'bis'], ['rue', 'du', 'bis']), -]) -@pytest.mark.parametrize("input,expected", [ - (['allee','a','b','c'], ['allee','abc']), (['allee','a','b','c','toto'], ['allee','abc','toto']), + (['allee','a','b','c','toto','d','e','f'], ['allee','abc','toto','def']), + (['allee','a','2','c','toto'], ['allee','a','2','c','toto']), + (['allee','a','b','c'], ['allee','abc']), (['allee','a','b','c','d'], ['allee','abcd']), (['allee','a','b','c','d','e'], ['allee','abcde']), ]) -def test_fold_initials(input, expected): - assert fold_initials(Token(input)) == expected +def test_fold_initials(inputs, expected): + tokens = [Token(input_) for input_ in inputs] + assert list(fold_initials(tokens)) == expected From 330169a89d923e9bb2d39339d2661db178711389 Mon Sep 17 00:00:00 2001 From: Christian Quest Date: Fri, 16 Feb 2018 19:48:58 +0100 Subject: [PATCH 10/15] pep8 --- addok_france/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index 6697a78..ec2dd55 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -111,7 +111,7 @@ def flag_housenumber(tokens): found = False for previous, token, next_ in neighborhood(tokens): if ((token.is_first or (next_ and TYPES_PATTERN.match(next_))) - and NUMBER_PATTERN.match(token) and not found): + and NUMBER_PATTERN.match(token) and not found): token.kind = 'housenumber' found = True yield token @@ -119,7 +119,7 @@ def flag_housenumber(tokens): def fold_ordinal(s): """3bis => 3b.""" - if s is not None and s !='' and s[0].isdigit() and not s.isdigit(): + if s is not None and s != '' and s[0].isdigit() and not s.isdigit(): try: number, ordinal = FOLD_PATTERN.findall(s)[0] except (IndexError, ValueError): From fd5d9ee94f81cb0f13c8f61aea8303ab089885b1 Mon Sep 17 00:00:00 2001 From: Christian Quest Date: Fri, 16 Feb 2018 20:25:13 +0100 Subject: [PATCH 11/15] pep8 --- addok_france/utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index ec2dd55..ab6235a 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -50,9 +50,12 @@ def clean_query(q): - q = re.sub(r'(^| )(boite postale|b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', r'\1', q, flags=re.IGNORECASE) - q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', q, flags=re.IGNORECASE) - q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(boite postale|b\.?p\.?|cs|tsa|cidex) *(n(o|°|) *|)[\d]+ *', + r'\1', q, flags=re.IGNORECASE) + q = re.sub(r'([\d]{2})[\d]{3}(.*)c(e|é)dex ?[\d]*', r'\1\2', + q, flags=re.IGNORECASE) + q = re.sub(r'([^\d ])([\d]{5})([^\d]|$)', r'\1 \2 ', + q, flags=re.IGNORECASE) q = re.sub('c(e|é)dex ?[\d]*', '', q, flags=re.IGNORECASE) q = re.sub('\d{,2}(e|[eè]me) ([eé]tage)', '', q, flags=re.IGNORECASE) q = re.sub(r'((fax|t[eé]l|t[eé]l[eé]copieur)[ :,\.]*|)(\d{10}|[0-9][0-9][ -\./]\d\d[-\./ ]\d\d[-\./ ]\d\d[-\./ ]\d\d)', '', q, flags=re.IGNORECASE) @@ -60,7 +63,8 @@ def clean_query(q): q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE) q = re.sub('^lieux?[ -]?dits?\\b(?=.)', '', q, flags=re.IGNORECASE) - q = re.sub(r'(^| )(([A-Z]) ([A-Z]) (([A-Z]) )?(([A-Z]) )?(([A-Z])( |$))?)', r'\1\2\3\4\6\8\10 ', q, flags=re.IGNORECASE) + q = re.sub(r'(^| )(([A-Z]) ([A-Z]) (([A-Z]) )?(([A-Z]) )?(([A-Z])( |$))?)', + r'\1\2\3\4\6\8\10 ', q, flags=re.IGNORECASE) q = q.strip() return q From 3ccc267cf7d1630663a40936bdf7f479efc96676 Mon Sep 17 00:00:00 2001 From: Christian Quest Date: Fri, 16 Feb 2018 20:29:10 +0100 Subject: [PATCH 12/15] pep8 --- tests/test_utils.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index c08f55b..62962b5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -356,12 +356,18 @@ def test_make_municipality_labels(config): @pytest.mark.parametrize("inputs,expected", [ - (['allee','a','b','c','toto'], ['allee','abc','toto']), - (['allee','a','b','c','toto','d','e','f'], ['allee','abc','toto','def']), - (['allee','a','2','c','toto'], ['allee','a','2','c','toto']), - (['allee','a','b','c'], ['allee','abc']), - (['allee','a','b','c','d'], ['allee','abcd']), - (['allee','a','b','c','d','e'], ['allee','abcde']), + (['allee', 'a', 'b', 'c', 'toto'], + ['allee', 'abc', 'toto']), + (['allee', 'a', 'b', 'c', 'toto', 'd', 'e', 'f'], + ['allee', 'abc', 'toto', 'def']), + (['allee', 'a', '2', 'c', 'toto'], + ['allee', 'a', '2', 'c', 'toto']), + (['allee', 'a', 'b', 'c'], + ['allee', 'abc']), + (['allee', 'a', 'b', 'c', 'd'], + ['allee', 'abcd']), + (['allee', 'a', 'b', 'c', 'd', 'e'], + ['allee', 'abcde']), ]) def test_fold_initials(inputs, expected): tokens = [Token(input_) for input_ in inputs] From 9b90da3a0a597ec36ee84a5476a5c765f053ff96 Mon Sep 17 00:00:00 2001 From: cquest Date: Fri, 16 Feb 2018 22:52:48 +0100 Subject: [PATCH 13/15] separate PR --- addok_france/utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/addok_france/utils.py b/addok_france/utils.py index ab6235a..e4414ce 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -123,17 +123,14 @@ def flag_housenumber(tokens): def fold_ordinal(s): """3bis => 3b.""" - if s is not None and s != '' and s[0].isdigit() and not s.isdigit(): + if s[0].isdigit() and not s.isdigit(): try: number, ordinal = FOLD_PATTERN.findall(s)[0] except (IndexError, ValueError): pass else: - try: - s = s.update('{}{}'.format(number, - FOLD.get(ordinal.lower(), ordinal))) - except: - pass + s = s.update('{}{}'.format(number, + FOLD.get(ordinal.lower(), ordinal))) return s From a3017f85751fcc945221a12d320787a1320ef77f Mon Sep 17 00:00:00 2001 From: cquest Date: Sun, 18 Feb 2018 17:25:26 +0100 Subject: [PATCH 14/15] fold_initials is a yielder --- addok_france/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 8f48d8c..4f55576 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,7 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) -fold_initials = yielder(utils.fold_initials) +fold_initials = utils.fold_initials flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) From bb92a1d35daf5e8c5cd5db222a2dda0162b9f1ed Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 21 Feb 2018 17:18:51 +0100 Subject: [PATCH 15/15] fold_initials > glue_initials --- addok_france/__init__.py | 2 +- addok_france/utils.py | 6 ++---- tests/test_utils.py | 8 ++++---- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 4f55576..c1ae1b7 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,7 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) -fold_initials = utils.fold_initials +glue_initials = utils.glue_initials flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) diff --git a/addok_france/utils.py b/addok_france/utils.py index e4414ce..c7d001c 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -63,8 +63,6 @@ def clean_query(q): q = re.sub('[ -]s/[ -]', ' sur ', q, flags=re.IGNORECASE) q = re.sub('[ -]s/s[ -]', ' sous ', q, flags=re.IGNORECASE) q = re.sub('^lieux?[ -]?dits?\\b(?=.)', '', q, flags=re.IGNORECASE) - q = re.sub(r'(^| )(([A-Z]) ([A-Z]) (([A-Z]) )?(([A-Z]) )?(([A-Z])( |$))?)', - r'\1\2\3\4\6\8\10 ', q, flags=re.IGNORECASE) q = q.strip() return q @@ -134,8 +132,8 @@ def fold_ordinal(s): return s -def fold_initials(tokens): - """ folds 'F F I' into 'FFI' """ +def glue_initials(tokens): + """ glue 'F F I' into 'FFI' """ initials = [] for _, token, next_ in neighborhood(tokens): isinitial = len(token) == 1 and token.isalpha() diff --git a/tests/test_utils.py b/tests/test_utils.py index 62962b5..eea4772 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,7 +8,7 @@ from addok.helpers.text import Token from addok_france.utils import (clean_query, extract_address, flag_housenumber, fold_ordinal, glue_ordinal, make_labels, - remove_leading_zeros, fold_initials) + remove_leading_zeros, glue_initials) @pytest.mark.parametrize("input,expected", [ @@ -76,7 +76,7 @@ ("32bis Rue des Vosges 93290 télécopieur, 01-23-45-67-89", "32bis Rue des Vosges 93290"), ("10 BLD DES F F I 85300 CHALLANS", - "10 BLD DES F F I FFI 85300 CHALLANS"), + "10 BLD DES F F I 85300 CHALLANS"), # done by glue_initials ]) def test_clean_query(input, expected): assert clean_query(input) == expected @@ -369,6 +369,6 @@ def test_make_municipality_labels(config): (['allee', 'a', 'b', 'c', 'd', 'e'], ['allee', 'abcde']), ]) -def test_fold_initials(inputs, expected): +def test_glue_initials(inputs, expected): tokens = [Token(input_) for input_ in inputs] - assert list(fold_initials(tokens)) == expected + assert list(glue_initials(tokens)) == expected