Merge pull request #37 from dsi-clinic/linkage-code-clean-up
linkage.py clean up including additions to constants.py
averyschoen authored Mar 5, 2024
2 parents 3c2005f + 793b8af commit 0af314c
Showing 3 changed files with 54 additions and 346 deletions.
28 changes: 28 additions & 0 deletions utils/constants.py
@@ -727,3 +727,31 @@
"lcv victory",
"league of conservation",
]

suffixes = [
"sr",
"jr",
"i",
"ii",
"iii",
"iv",
"v",
"vi",
"vii",
"viii",
"ix",
"x",
]

titles = [
"mr",
"ms",
"mrs",
"miss",
"prof",
"dr",
"doctor",
"sir",
"madam",
"professor",
]
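
These lists support the name cleaning in utils/linkage.py (see determine_comma_role and get_likely_name below). A minimal sketch of how they might be used; strip_titles_and_suffixes is a hypothetical helper, not part of this commit:

```python
from utils.constants import suffixes, titles

def strip_titles_and_suffixes(name: str) -> str:
    # Hypothetical: drop any token matching a known title or suffix
    tokens = [token.strip(".,") for token in name.lower().split()]
    return " ".join(t for t in tokens if t not in titles and t not in suffixes)

strip_titles_and_suffixes("Dr. Jane Doe Jr.")  # -> 'jane doe'
```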
268 changes: 25 additions & 243 deletions utils/linkage.py
@@ -1,15 +1,12 @@
import math
import os.path
import re

import numpy as np
import pandas as pd
import textdistance as td
import usaddress
from names_dataset import NameDataset
from splink.duckdb.linker import DuckDBLinker

-from utils.constants import COMPANY_TYPES, repo_root
+from utils.constants import COMPANY_TYPES, repo_root, suffixes, titles

"""
Module for performing record linkage on state campaign finance dataset
@@ -21,10 +18,14 @@ def get_address_line_1_from_full_address(address: str) -> str:
Address line 1 usually includes street address or PO Box information.
Uses the usaddress library which splits an address string into components,
and labels each component.
https://usaddress.readthedocs.io/en/latest/
Args:
address: raw string representing full address
Returns:
-address_line_1
+address_line_1 as a string
Sample Usage:
>>> get_address_line_1_from_full_address('6727 W. Corrine Dr. Peoria,AZ 85381')
@@ -42,7 +43,7 @@ def get_address_line_1_from_full_address(address: str) -> str:

address_tuples = usaddress.parse(
address
-) # takes a string address and put them into value,key pairs as tuples
+) # takes a string address and splits it into value, key pairs as tuples
line1_components = []
for value, key in address_tuples:
if key == "PlaceName":
@@ -60,167 +61,6 @@ def get_address_line_1_from_full_address(address: str) -> str:
return line1
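
For reference, usaddress.parse returns (token, label) tuples, which is what the loop above filters on. An illustrative call adapted from the usaddress documentation (labels vary with the input):

```python
import usaddress

usaddress.parse("123 Main St. Suite 100 Chicago IL 60652")
# [('123', 'AddressNumber'), ('Main', 'StreetName'), ('St.', 'StreetNamePostType'),
#  ('Suite', 'OccupancyType'), ('100', 'OccupancyIdentifier'),
#  ('Chicago', 'PlaceName'), ('IL', 'StateName'), ('60652', 'ZipCode')]
```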


def calculate_string_similarity(string1: str, string2: str) -> float:
"""Returns how similar two strings are on a scale of 0 to 1
This version utilizes Jaro-Winkler distance, which is a metric of
edit distance. Jaro-Winkler specially prioritizes the early
characters in a string.
Since the ends of strings are often more valuable in matching names
and addresses, we reverse the strings before matching them.
https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js
The exact meaning of the metric is open, but the following must hold true:
1. equivalent strings must return 1
2. strings with no similar characters must return 0
3. strings with higher intuitive similarity must return higher scores
Args:
string1: any string
string2: any string
Returns:
similarity score
Sample Usage:
>>> calculate_string_similarity("exact match", "exact match")
1.0
>>> calculate_string_similarity("aaaaaa", "bbbbbbbbbbb")
0.0
>>> similar_score = calculate_string_similarity("very similar", "vary similar")
>>> different_score = calculate_string_similarity("very similar", "very not close")
>>> similar_score > different_score
True
"""

return float(td.jaro_winkler(string1.lower()[::-1], string2.lower()[::-1]))
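
Why reverse? Jaro-Winkler boosts matching prefixes, so reversing both strings turns a shared suffix (often the informative end of a name or street) into the boosted prefix. An illustrative check, not part of the diff:

```python
import textdistance as td

a, b = "100 main street", "200 main street"
td.jaro_winkler(a, b)              # no prefix bonus: the house numbers differ
td.jaro_winkler(a[::-1], b[::-1])  # shared street name now leads, so this scores higher
```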


def calculate_row_similarity(
row1: pd.DataFrame, row2: pd.DataFrame, weights: np.array, comparison_func
) -> float:
"""Find weighted similarity of two rows in a dataframe
The length of the weights vector must be the same as
the number of selected columns.
This version is slow and unoptimized, and will be
revised for efficiency. It exists to provide basic
functionality. Once we have the comparison function
locked in, using .apply will likely be easier and
more efficient.
"""

row_length = len(weights)
if not (row1.shape[1] == row2.shape[1] == row_length):
raise ValueError("Number of columns and weights must be the same")

similarity = np.zeros(row_length)

for i in range(row_length):
similarity[i] = comparison_func(
row1.reset_index().drop(columns="index").iloc[:, i][0],
row2.reset_index().drop(columns="index").iloc[:, i][0],
)

return sum(similarity * weights)
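
An illustrative call with made-up data:

```python
row1 = pd.DataFrame({"name": ["jane doe"], "city": ["chicago"]})
row2 = pd.DataFrame({"name": ["jane dow"], "city": ["chicago"]})

# weighted sum: 0.7 * similarity(names) + 0.3 * similarity(cities)
calculate_row_similarity(row1, row2, np.array([0.7, 0.3]), calculate_string_similarity)
```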


def row_matches(
df: pd.DataFrame, weights: np.array, threshold: float, comparison_func
) -> dict:
"""Get weighted similarity score of two rows
Run through the rows using indices: if two rows have a comparison score
greater than a threshold, we assign the later row to the former. Any
row which is matched to any other row is not examined again. Matches are
stored in a dictionary object, with each index appearing no more than once.
This is not optimized. We are not yet sure how to build a good test
case for this; it will be raised in a mentor session.
"""

all_indices = np.array(list(df.index))

index_dict = {}
[index_dict.setdefault(x, []) for x in all_indices]

discard_indices = []

end = max(all_indices)
for i in all_indices:
# Skip indices that have been stored in the discard_indices list
if i in discard_indices:
continue

# Iterate through the remaining indices; end + 1 so the last row is compared too
for j in range(i + 1, end + 1):
if j in discard_indices:
continue

# Our conditional
if (
calculate_row_similarity(
df.iloc[[i]], df.iloc[[j]], weights, comparison_func
)
> threshold
):
# Store the other index and mark it for skipping in future iterations
discard_indices.append(j)
index_dict[i].append(j)

return index_dict
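
An illustrative call; exact scores depend on textdistance, but the two "jane" rows should clear a 0.9 threshold while "john smith" does not:

```python
df = pd.DataFrame(
    {
        "name": ["jane doe", "jane dow", "john smith"],
        "city": ["chicago", "chicago", "peoria"],
    }
)
row_matches(df, np.array([0.7, 0.3]), 0.9, calculate_string_similarity)
# -> {0: [1], 1: [], 2: []}; row 1 is assigned to row 0 and skipped afterwards
```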


def match_confidence(
confidences: np.array(float), weights: np.array(float), weights_toggle: bool
) -> float:
"""Combine confidences for row matches into a final confidence
This is a weighted log-odds based combination of row match confidences
originating from various record linkage methods. Weights will be applied
to the linkage methods in order and must be of the same length.
weights_toggle allows one to turn weights on and off when calling the
function. False cancels the use of weights.
Since log-odds behave badly at 0 and 1, we truncate at ±5,
which corresponds to a probability of roughly half a percent
(or one minus that at the upper end).
>>> match_confidence(np.array([.6, .9, .0001]), np.array([2,5.7,8]), True)
2.627759082143462e-12
>>> match_confidence(np.array([.6, .9, .0001]), np.array([2,5.7,8]), False)
0.08337802853594725
"""

if (min(confidences) < 0) or (max(confidences) > 1):
raise ValueError("Probabilities must be bounded on [0, 1]")

log_odds = []

for c in confidences:
l_o = np.log(c / (1 - c))

if l_o > 5:
l_o = 5

elif l_o < -5:
l_o = -5

log_odds.append(l_o)

if weights_toggle:
log_odds = log_odds * weights

l_o_sum = np.sum(log_odds)

conf_sum = math.e ** (l_o_sum) / (1 + math.e ** (l_o_sum))
return conf_sum
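
The loop above is equivalent to this vectorized sketch, which reproduces the unweighted doctest value (about 0.0834):

```python
import numpy as np

confidences = np.array([0.6, 0.9, 0.0001])
log_odds = np.clip(np.log(confidences / (1 - confidences)), -5, 5)  # truncated logit
1.0 / (1.0 + np.exp(-log_odds.sum()))                               # back to a probability
```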


def determine_comma_role(name: str) -> str:
"""Given a string (someone's name), attempts to determine the role of the
comma in the name and where it ought to belong.
@@ -247,20 +87,7 @@ def determine_comma_role(name: str) -> str:
>>> determine_comma_role("DOe, Jane")
' Jane Doe'
"""
suffixes = [
"sr",
"jr",
"i",
"ii",
"iii",
"iv",
"v",
"vi",
"vii",
"viii",
"ix",
"x",
]

name_parts = name.lower().split(",")
# if the comma is just in the end as a typo:
if len(name_parts[1]) == 0:
@@ -326,18 +153,6 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:

# some names have titles or professions associated with the name. We need to
# remove those from the name.
titles = [
"mr",
"ms",
"mrs",
"miss",
"prof",
"dr",
"doctor",
"sir",
"madam",
"professor",
]
names = [first_name, last_name, full_name]

for i in range(len(names)):
@@ -363,10 +178,14 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
def get_street_from_address_line_1(address_line_1: str) -> str:
"""Given an address line 1, return the street name
Uses the usaddress library which splits an address string into components,
and labels each component.
https://usaddress.readthedocs.io/en/latest/
Args:
-address_line_1: either street information or PO box
+address_line_1: either street information or PO box as a string
Returns:
-street name
+street name as a string
Raises:
ValueError: if string is malformed and no street can be reasonably
found.
@@ -405,54 +224,6 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
return " ".join(string)


def name_rank(first_name: str, last_name: str) -> list:
"""Returns a score for the rank of a given first name and last name
https://github.com/philipperemy/name-dataset
Args:
first_name: any string
last_name: any string
Returns:
name ranks for the first and last names
Rank 1 is the most common name; ranks cover the United States only
First element in the list corresponds to the rank of the first name
Second element in the list corresponds to the rank of the last name
Empty or non-string values will return None
Names that are not found in the dataset will return 0
>>> name_rank("John", "Smith")
[5, 7]
>>> name_rank("Adil", "Kassim")
[0, 7392]
>>> name_rank(None, 9)
[None, None]
"""

# Initialize the NameDataset class
nd = NameDataset()

first_name_rank = 0
last_name_rank = 0
if isinstance(first_name, str):
first_name_result = nd.search(first_name)
if first_name_result and isinstance(first_name_result, dict):
first_name_data = first_name_result.get("first_name")
if first_name_data and "rank" in first_name_data:
first_name_rank = first_name_data["rank"].get(
"United States", 0
)
else:
first_name_rank = None
if isinstance(last_name, str):
last_name_result = nd.search(last_name)
if last_name_result and isinstance(last_name_result, dict):
last_name_data = last_name_result.get("last_name")
if last_name_data and "rank" in last_name_data:
last_name_rank = last_name_data["rank"].get("United States", 0)
else:
last_name_rank = None
return [first_name_rank, last_name_rank]
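
For context, the names_dataset lookup that the function wraps looks roughly like this. The result structure follows the library's README and may differ across versions; note that instantiating NameDataset loads the full dataset into memory and is slow:

```python
from names_dataset import NameDataset

nd = NameDataset()
result = nd.search("John")
# result["first_name"]["rank"] maps country names to popularity ranks
result["first_name"]["rank"]["United States"]  # -> 5, matching the doctest above
```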


def convert_duplicates_to_dict(df: pd.DataFrame) -> None:
"""For each uuid, maps it to all other uuids for which it has been deemed a
match.
@@ -537,6 +308,7 @@ def cleaning_company_column(company_entry: str) -> str:
standardized for retired, self employed, and unemployed,
or original string if no match or empty string
Sample Usage:
>>> cleaning_company_column("Retireed")
'Retired'
>>> cleaning_company_column("self")
@@ -592,6 +364,7 @@ def standardize_corp_names(company_name: str) -> str:
Returns:
standardized company name
Sample Usage:
>>> standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC')
'MI BEER WINE WHOLESALERS ASSOCIATION'
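
The function body is collapsed above. A hypothetical sketch of the expansion step, assuming COMPANY_TYPES in utils/constants.py maps abbreviations to full words (e.g. "ASSOC" to "ASSOCIATION"):

```python
from utils.constants import COMPANY_TYPES

def expand_company_tokens(company_name: str) -> str:
    # Hypothetical helper: replace known abbreviations token by token
    tokens = company_name.upper().split()
    return " ".join(COMPANY_TYPES.get(token, token) for token in tokens)

expand_company_tokens("MI BEER WINE WHOLESALERS ASSOC")
# -> 'MI BEER WINE WHOLESALERS ASSOCIATION'
```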
@@ -617,6 +390,10 @@ def standardize_corp_names(company_name: str) -> str:
def get_address_number_from_address_line_1(address_line_1: str) -> str:
"""Given an address line 1, return the building number or po box
Uses the usaddress library which splits an address string into components,
and labels each component.
https://usaddress.readthedocs.io/en/latest/
Args:
address_line_1: either street information or PO box
Returns:
@@ -655,6 +432,11 @@ def splink_dedupe(
individuals_settings, indivduals_blocking, organizations_settings,
organizations_blocking
Uses the splink library which employs probabilistic matching for
record linkage
https://moj-analytical-services.github.io/splink/index.html
Args:
df: dataframe
settings: configuration settings
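
The body of splink_dedupe is collapsed above. A minimal sketch of the splink v3 DuckDB workflow it wraps, with method names following the splink documentation; the project's actual settings and blocking rules live in utils/constants.py and are not shown in this hunk:

```python
from splink.duckdb.linker import DuckDBLinker

linker = DuckDBLinker(df, settings)  # settings: a splink settings dictionary
linker.estimate_u_using_random_sampling(max_pairs=1e6)

predictions = linker.predict()  # pairwise match probabilities
clusters = linker.cluster_pairwise_predictions_at_threshold(
    predictions, threshold_match_probability=0.9
)
deduped = clusters.as_pandas_dataframe()
```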