diff --git a/utils/constants.py b/utils/constants.py
index b4be256..bc6d4cf 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -727,3 +727,31 @@
     "lcv victory",
     "league of conservation",
 ]
+
+suffixes = [
+    "sr",
+    "jr",
+    "i",
+    "ii",
+    "iii",
+    "iv",
+    "v",
+    "vi",
+    "vii",
+    "viii",
+    "ix",
+    "x",
+]
+
+titles = [
+    "mr",
+    "ms",
+    "mrs",
+    "miss",
+    "prof",
+    "dr",
+    "doctor",
+    "sir",
+    "madam",
+    "professor",
+]
diff --git a/utils/linkage.py b/utils/linkage.py
index 662d18e..32a44df 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,15 +1,12 @@
-import math
 import os.path
 import re
 
 import numpy as np
 import pandas as pd
-import textdistance as td
 import usaddress
-from names_dataset import NameDataset
 from splink.duckdb.linker import DuckDBLinker
 
-from utils.constants import COMPANY_TYPES, repo_root
+from utils.constants import COMPANY_TYPES, repo_root, suffixes, titles
 
 """
 Module for performing record linkage on state campaign finance dataset
@@ -21,10 +18,14 @@ def get_address_line_1_from_full_address(address: str) -> str:
 
     Address line 1 usually includes street address or PO Box information.
 
+    Uses the usaddress library, which splits an address string into
+    components and labels each component.
+    https://usaddress.readthedocs.io/en/latest/
+
     Args:
         address: raw string representing full address
     Returns:
-        address_line_1
+        address_line_1 as a string
 
     Sample Usage:
     >>> get_address_line_1_from_full_address('6727 W. Corrine Dr. Peoria,AZ 85381')
@@ -42,7 +43,7 @@ def get_address_line_1_from_full_address(address: str) -> str:
 
     address_tuples = usaddress.parse(
         address
-    )  # takes a string address and put them into value,key pairs as tuples
+    )  # takes a string address and puts it into (value, key) pairs as tuples
     line1_components = []
     for value, key in address_tuples:
         if key == "PlaceName":
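Note: the hunks above lean on usaddress's component labeling. A minimal sketch of what `usaddress.parse` returns, reusing the sample address from the docstring above; the labels shown in the comments come from usaddress's standard tag set, and the PlaceName cut-off mirrors the loop in `get_address_line_1_from_full_address`:

```python
import usaddress

# parse() splits the raw string into (value, key) tuples, one per token.
sample = "6727 W. Corrine Dr. Peoria, AZ 85381"
for value, key in usaddress.parse(sample):
    print(value, key)
# Roughly: ('6727', 'AddressNumber'), ('W.', 'StreetNamePreDirectional'),
# ('Corrine', 'StreetName'), ('Dr.', 'StreetNamePostType'),
# ('Peoria,', 'PlaceName'), ('AZ', 'StateName'), ('85381', 'ZipCode')

# Keep tokens up to the first PlaceName label to recover address line 1.
line1_tokens = []
for value, key in usaddress.parse(sample):
    if key == "PlaceName":
        break
    line1_tokens.append(value)
print(" ".join(line1_tokens))  # '6727 W. Corrine Dr.'
```

usaddress also offers `tag()`, which returns an OrderedDict keyed by label, but `parse()` preserves token order, which is what rebuilding line 1 relies on.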
@@ -60,167 +61,6 @@ def get_address_line_1_from_full_address(address: str) -> str:
     return line1
 
 
-def calculate_string_similarity(string1: str, string2: str) -> float:
-    """Returns how similar two strings are on a scale of 0 to 1
-
-    This version utilizes Jaro-Winkler distance, which is a metric of
-    edit distance. Jaro-Winkler specially prioritizes the early
-    characters in a string.
-
-    Since the ends of strings are often more valuable in matching names
-    and addresses, we reverse the strings before matching them.
-
-    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
-    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js
-
-    The exact meaning of the metric is open, but the following must hold true:
-    1. equivalent strings must return 1
-    2. strings with no similar characters must return 0
-    3. strings with higher intuitive similarity must return higher scores
-    similarity score
-
-    Args:
-        string1: any string
-        string2: any string
-    Returns:
-        similarity score
-
-    Sample Usage:
-    >>> calculate_string_similarity("exact match", "exact match")
-    1.0
-    >>> calculate_string_similarity("aaaaaa", "bbbbbbbbbbb")
-    0.0
-    >>> similar_score = calculate_string_similarity("very similar", "vary similar")
-    >>> different_score = calculate_string_similarity("very similar", "very not close")
-    >>> similar_score > different_score
-    True
-    """
-
-    return float(td.jaro_winkler(string1.lower()[::-1], string2.lower()[::-1]))
-
-
-def calculate_row_similarity(
-    row1: pd.DataFrame, row2: pd.DataFrame, weights: np.array, comparison_func
-) -> float:
-    """Find weighted similarity of two rows in a dataframe
-
-    The length of the weights vector must be the same as
-    the number of selected columns.
-
-    This version is slow and not optimized, and will be
-    revised in order to make it more efficient. It
-    exists as to provide basic functionality. Once we have
-    the comparison function locked in, using .apply will
-    likely be easier and more efficient.
-    """
-
-    row_length = len(weights)
-    if not (row1.shape[1] == row2.shape[1] == row_length):
-        raise ValueError("Number of columns and weights must be the same")
-
-    similarity = np.zeros(row_length)
-
-    for i in range(row_length):
-        similarity[i] = comparison_func(
-            row1.reset_index().drop(columns="index").iloc[:, i][0],
-            row2.reset_index().drop(columns="index").iloc[:, i][0],
-        )
-
-    return sum(similarity * weights)
-
-
-def row_matches(
-    df: pd.DataFrame, weights: np.array, threshold: float, comparison_func
-) -> dict:
-    """Get weighted similarity score of two rows
-
-    Run through the rows using indices: if two rows have a comparison score
-    greater than a threshold, we assign the later row to the former. Any
-    row which is matched to any other row is not examined again. Matches are
-    stored in a dictionary object, with each index appearing no more than once.
-
-    This is not optimized. Not presently sure how to make a good test case
-    for this, will submit and ask in mentor session.
-    """
-
-    all_indices = np.array(list(df.index))
-
-    index_dict = {}
-    [index_dict.setdefault(x, []) for x in all_indices]
-
-    discard_indices = []
-
-    end = max(all_indices)
-    for i in all_indices:
-        # Skip indices that have been stored in the discard_indices list
-        if i in discard_indices:
-            continue
-
-        # Iterate through the remaining numbers
-        for j in range(i + 1, end):
-            if j in discard_indices:
-                continue
-
-            # Our conditional
-            if (
-                calculate_row_similarity(
-                    df.iloc[[i]], df.iloc[[j]], weights, comparison_func
-                )
-                > threshold
-            ):
-                # Store the other index and mark it for skipping in future iterations
-                discard_indices.append(j)
-                index_dict[i].append(j)
-
-    return index_dict
-
-
-def match_confidence(
-    confidences: np.array(float), weights: np.array(float), weights_toggle: bool
-) -> float:
-    """Combine confidences for row matches into a final confidence
-
-    This is a weighted log-odds based combination of row match confidences
-    originating from various record linkage methods. Weights will be applied
-    to the linkage methods in order and must be of the same length.
-
-    weights_toggle allows one to turn weights on and off when calling the
-    function. False cancels the use of weights.
-
-    Since log-odds have undesirable behaviors at 0 and 1, we truncate at
-    +-5, which corresponds to around half a percent probability or
-    1 - the same.
-
-    >>> match_confidence(np.array([.6, .9, .0001]), np.array([2,5.7,8]), True)
-    2.627759082143462e-12
-    >>> match_confidence(np.array([.6, .9, .0001]), np.array([2,5.7,8]), False)
-    0.08337802853594725
-    """
-
-    if (min(confidences) < 0) or (max(confidences) > 1):
-        raise ValueError("Probabilities must be bounded on [0, 1]")
-
-    log_odds = []
-
-    for c in confidences:
-        l_o = np.log(c / (1 - c))
-
-        if l_o > 5:
-            l_o = 5
-
-        elif l_o < -5:
-            l_o = -5
-
-        log_odds.append(l_o)
-
-    if weights_toggle:
-        log_odds = log_odds * weights
-
-    l_o_sum = np.sum(log_odds)
-
-    conf_sum = math.e ** (l_o_sum) / (1 + math.e ** (l_o_sum))
-    return conf_sum
-
-
 def determine_comma_role(name: str) -> str:
     """Given a string (someone's name), attempts to determine the role of the
     comma in the name and where it ought to belong.
@@ -247,20 +87,7 @@ def determine_comma_role(name: str) -> str:
     >>> determine_comma_role("DOe, Jane")
     ' Jane Doe'
     """
-    suffixes = [
-        "sr",
-        "jr",
-        "i",
-        "ii",
-        "iii",
-        "iv",
-        "v",
-        "vi",
-        "vii",
-        "viii",
-        "ix",
-        "x",
-    ]
+
     name_parts = name.lower().split(",")
     # if the comma is just in the end as a typo:
     if len(name_parts[1]) == 0:
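Note: the hunk above swaps determine_comma_role's function-local suffixes list for the shared one added to utils/constants.py. A hypothetical sketch of the kind of check that list supports; the helper name is made up, and the real function handles more branches than this:

```python
from utils.constants import suffixes  # the list added to constants.py above

def comma_text_is_suffix(name: str) -> bool:
    """Return True when the text after the comma is a generational suffix.

    In that case ("Doe, Jr") the comma does not separate last name from
    first name, so the name should not be reordered. Assumes the input
    contains exactly one comma, as in determine_comma_role's doctests.
    """
    after_comma = name.lower().split(",")[1].strip()
    return after_comma in suffixes

# comma_text_is_suffix("Doe, Jr")   -> True  (suffix: keep word order)
# comma_text_is_suffix("Doe, Jane") -> False (last-first: reorder)
```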
@@ -326,18 +153,6 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
 
     # some names have titles or professions associated with the name. We need to
     # remove those from the name.
-    titles = [
-        "mr",
-        "ms",
-        "mrs",
-        "miss",
-        "prof",
-        "dr",
-        "doctor",
-        "sir",
-        "madam",
-        "professor",
-    ]
     names = [first_name, last_name, full_name]
 
     for i in range(len(names)):
@@ -363,10 +178,14 @@
 def get_street_from_address_line_1(address_line_1: str) -> str:
     """Given an address line 1, return the street name
 
+    Uses the usaddress library, which splits an address string into
+    components and labels each component.
+    https://usaddress.readthedocs.io/en/latest/
+
     Args:
-        address_line_1: either street information or PO box
+        address_line_1: either street information or PO box as a string
     Returns:
-        street name
+        street name as a string
     Raises:
         ValueError: if string is malformed and no street can be reasonably found.
@@ -405,54 +224,6 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
     return " ".join(string)
 
 
-def name_rank(first_name: str, last_name: str) -> list:
-    """Returns a score for the rank of a given first name and last name
-    https://github.com/philipperemy/name-dataset
-    Args:
-        first_name: any string
-        last_name: any string
-    Returns:
-        name rank for first name and last names
-        1 is the most common name, only for names in the United States
-        First element in the list corresponds to the rank of the first name
-        Second element in the list corresponds to the rank of the last name
-        Empty or non string values will return None
-        Names that are not found in the dataset will return 0
-
-    >>> name_rank("John", "Smith")
-    [5, 7]
-    >>> name_rank("Adil", "Kassim")
-    [0, 7392]
-    >>> name_rank(None, 9)
-    [None, None]
-    """
-
-    # Initialize the NameDataset class
-    nd = NameDataset()
-
-    first_name_rank = 0
-    last_name_rank = 0
-    if isinstance(first_name, str):
-        first_name_result = nd.search(first_name)
-        if first_name_result and isinstance(first_name_result, dict):
-            first_name_data = first_name_result.get("first_name")
-            if first_name_data and "rank" in first_name_data:
-                first_name_rank = first_name_data["rank"].get(
-                    "United States", 0
-                )
-    else:
-        first_name_rank = None
-    if isinstance(last_name, str):
-        last_name_result = nd.search(last_name)
-        if last_name_result and isinstance(last_name_result, dict):
-            last_name_data = last_name_result.get("last_name")
-            if last_name_data and "rank" in last_name_data:
-                last_name_rank = last_name_data["rank"].get("United States", 0)
-    else:
-        last_name_rank = None
-    return [first_name_rank, last_name_rank]
-
-
 def convert_duplicates_to_dict(df: pd.DataFrame) -> None:
     """For each uuid, maps it to all other uuids for which it has been deemed a
     match.
@@ -537,6 +308,7 @@ def cleaning_company_column(company_entry: str) -> str:
         standardized for retired, self employed, and unemployed,
         or original string if no match or empty string
 
+    Sample Usage:
     >>> cleaning_company_column("Retireed")
     'Retired'
     >>> cleaning_company_column("self")
     'Self Employed'
@@ -592,6 +364,7 @@
 
     Returns:
         standardized company name
 
+    Sample Usage:
     >>> standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC')
     'MI BEER WINE WHOLESALERS ASSOCIATION'
@@ -617,6 +390,10 @@
 def get_address_number_from_address_line_1(address_line_1: str) -> str:
     """Given an address line 1, return the building number or po box
 
+    Uses the usaddress library, which splits an address string into
+    components and labels each component.
+    https://usaddress.readthedocs.io/en/latest/
+
     Args:
         address_line_1: either street information or PO box
     Returns:
@@ -655,6 +432,11 @@
     individuals_settings, indivduals_blocking,
     organizations_settings, organizations_blocking
 
+    Uses the splink library, which employs probabilistic matching for
+    record linkage.
+    https://moj-analytical-services.github.io/splink/index.html
+
+
     Args:
         df: dataframe
         settings: configuration settings
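Note: splink_dedupe's docstring now points at splink's probabilistic matching. A minimal sketch of the dedupe flow it wraps, assuming splink 3.x with the DuckDB backend shown in the imports; the columns, blocking rule, and thresholds below are illustrative stand-ins for the real individuals_settings / organizations_settings kept in constants.py:

```python
import pandas as pd
import splink.duckdb.comparison_library as cl
from splink.duckdb.linker import DuckDBLinker

# Illustrative records; splink expects a unique_id column by default.
df = pd.DataFrame(
    {
        "unique_id": [1, 2, 3],
        "first_name": ["jane", "jane", "bob"],
        "last_name": ["doe", "does", "smith"],
    }
)

settings = {
    "link_type": "dedupe_only",
    # Only rows agreeing on first_name are compared, which keeps the
    # candidate pair count manageable on large tables.
    "blocking_rules_to_generate_predictions": ["l.first_name = r.first_name"],
    "comparisons": [
        cl.exact_match("first_name"),
        cl.levenshtein_at_thresholds("last_name", 2),
    ],
}

linker = DuckDBLinker(df, settings)
# Estimate u probabilities from random record pairs (older splink 3
# releases name this argument target_rows rather than max_pairs).
linker.estimate_u_using_random_sampling(max_pairs=1e6)
pairs = linker.predict(threshold_match_probability=0.7)
print(pairs.as_pandas_dataframe())
```

In splink 3.x, grouping the scored pairs into entity clusters is one more call, e.g. `linker.cluster_pairwise_predictions_at_threshold(pairs, 0.95)`.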
diff --git a/utils/tests/test_linkage.py b/utils/tests/test_linkage.py
index 4a5f73f..d96339d 100644
--- a/utils/tests/test_linkage.py
+++ b/utils/tests/test_linkage.py
@@ -1,115 +1,13 @@
-import numpy as np
 import pandas as pd
 import pytest
 
 from utils.constants import BASE_FILEPATH
-from utils.linkage import (
-    calculate_row_similarity,
-    calculate_string_similarity,
-    deduplicate_perfect_matches,
-    row_matches,
-)
+from utils.linkage import deduplicate_perfect_matches
 
 """
 Module for testing functions in linkage.py
 """
 
-# Creating a test for calculate_row_similarity and row_matches
-
-# to put in data:
-d = {
-    "name": ["bob von rosevich", "anantarya smith", "bob j vonrosevich"],
-    "address": [
-        "3 Blue Drive, Chicago",
-        "4 Blue Drive, Chicago",
-        "8 Fancy Way, Chicago",
-    ],
-}
-
-test_df = pd.DataFrame(data=d)
-
-
-@pytest.fixture
-def row_similarity_scen_1():
-    return test_df
-
-
-@pytest.fixture
-def row_similarity_scen_2():
-    return test_df
-
-
-def test_row_similarity_scen_1(row_similarity_scen_1):
-    wrong = calculate_row_similarity(
-        row_similarity_scen_1.iloc[[0]],
-        row_similarity_scen_1.iloc[[1]],
-        np.array([0.8, 0.2]),
-        calculate_string_similarity,
-    )
-    right = calculate_row_similarity(
-        row_similarity_scen_1.iloc[[0]],
-        row_similarity_scen_1.iloc[[2]],
-        np.array([0.8, 0.2]),
-        calculate_string_similarity,
-    )
-
-    assert right > wrong
-
-
-def test_row_similarity_scen_2(row_similarity_scen_2):
-    wrong = calculate_row_similarity(
-        row_similarity_scen_2.iloc[[0]],
-        row_similarity_scen_2.iloc[[1]],
-        np.array([0.2, 0.8]),
-        calculate_string_similarity,
-    )
-    right = calculate_row_similarity(
-        row_similarity_scen_2.iloc[[0]],
-        row_similarity_scen_2.iloc[[2]],
-        np.array([0.2, 0.8]),
-        calculate_string_similarity,
-    )
-
-    assert right < wrong
-
-
-d2 = {
-    "name": [
-        "bob von rosevich",
-        "anantarya smith",
-        "bob j vonrosevich",
-        "missy elliot",
-        "mr johnson",
-        "quarantin directino",
-        "missy eliot",
-        "joseph johnson",
-    ],
-    "address": [
-        "3 Blue Drive, Chicago",
-        "4 Blue Drive, Chicago",
-        "8 Fancy Way, Chicago",
-        "8 Fancy Way, Evanston",
-        "17 Regular Road, Chicago",
-        "42 Hollywood Boulevard, Chicago",
-        "8 Fancy Way, Evanston",
-        "17 Regular Road, Chicago",
-    ],
-}
-test_df2 = pd.DataFrame(data=d2)
-
-
-@pytest.fixture
-def row_match_scen1():
-    return test_df2
-
-
-def test_row_matches(row_match_scen1):
-    res = row_matches(
-        row_match_scen1, np.array([0.8, 0.2]), 0.9, calculate_string_similarity
-    )
-
-    assert res == {0: [2], 1: [], 2: [], 3: [6], 4: [], 5: [], 6: [], 7: []}
-
-
 # Test for dedupe function
 @pytest.fixture