diff --git a/utils/constants.py b/utils/constants.py
index b4be256..bc6d4cf 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -727,3 +727,31 @@
     "lcv victory",
     "league of conservation",
 ]
+
+suffixes = [
+    "sr",
+    "jr",
+    "i",
+    "ii",
+    "iii",
+    "iv",
+    "v",
+    "vi",
+    "vii",
+    "viii",
+    "ix",
+    "x",
+]
+
+titles = [
+    "mr",
+    "ms",
+    "mrs",
+    "miss",
+    "prof",
+    "dr",
+    "doctor",
+    "sir",
+    "madam",
+    "professor",
+]
diff --git a/utils/linkage.py b/utils/linkage.py
index 662d18e..32a44df 100644
--- a/utils/linkage.py
+++ b/utils/linkage.py
@@ -1,15 +1,12 @@
-import math
 import os.path
 import re
 
 import numpy as np
 import pandas as pd
-import textdistance as td
 import usaddress
-from names_dataset import NameDataset
 from splink.duckdb.linker import DuckDBLinker
 
-from utils.constants import COMPANY_TYPES, repo_root
+from utils.constants import COMPANY_TYPES, repo_root, suffixes, titles
 
 """
 Module for performing record linkage on state campaign finance dataset
@@ -21,10 +18,14 @@ def get_address_line_1_from_full_address(address: str) -> str:
 
     Address line 1 usually includes street address or PO Box information.
 
+    Uses the usaddress library, which splits an address string into
+    components and labels each component.
+    https://usaddress.readthedocs.io/en/latest/
+
     Args:
         address: raw string representing full address
     Returns:
-        address_line_1
+        address_line_1 as a string
 
     Sample Usage:
     >>> get_address_line_1_from_full_address('6727 W. Corrine Dr. Peoria,AZ 85381')
@@ -42,7 +43,7 @@ def get_address_line_1_from_full_address(address: str) -> str:
 
     address_tuples = usaddress.parse(
         address
-    )  # takes a string address and put them into value,key pairs as tuples
+    )  # takes a string address and puts it into (value, key) pairs as tuples
     line1_components = []
     for value, key in address_tuples:
         if key == "PlaceName":
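Note: the hunks above lean on usaddress's component labeling. A minimal sketch of what `usaddress.parse` returns, reusing the sample address from the docstring above; the labels shown in the comments come from usaddress's standard tag set, and the PlaceName cut-off mirrors the loop in `get_address_line_1_from_full_address`:

```python
import usaddress

# parse() splits the raw string into (value, key) tuples, one per token.
sample = "6727 W. Corrine Dr. Peoria, AZ 85381"
for value, key in usaddress.parse(sample):
    print(value, key)
# Roughly: ('6727', 'AddressNumber'), ('W.', 'StreetNamePreDirectional'),
# ('Corrine', 'StreetName'), ('Dr.', 'StreetNamePostType'),
# ('Peoria,', 'PlaceName'), ('AZ', 'StateName'), ('85381', 'ZipCode')

# Keep tokens up to the first PlaceName label to recover address line 1.
line1_tokens = []
for value, key in usaddress.parse(sample):
    if key == "PlaceName":
        break
    line1_tokens.append(value)
print(" ".join(line1_tokens))  # '6727 W. Corrine Dr.'
```

usaddress also offers `tag()`, which returns an OrderedDict keyed by label, but `parse()` preserves token order, which is what rebuilding line 1 relies on.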
@@ -60,167 +61,6 @@ def get_address_line_1_from_full_address(address: str) -> str:
     return line1
 
 
-def calculate_string_similarity(string1: str, string2: str) -> float:
-    """Returns how similar two strings are on a scale of 0 to 1
-
-    This version utilizes Jaro-Winkler distance, which is a metric of
-    edit distance. Jaro-Winkler specially prioritizes the early
-    characters in a string.
-
-    Since the ends of strings are often more valuable in matching names
-    and addresses, we reverse the strings before matching them.
-
-    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
-    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/jaro-winkler.js
-
-    The exact meaning of the metric is open, but the following must hold true:
-    1. equivalent strings must return 1
-    2. strings with no similar characters must return 0
-    3. strings with higher intuitive similarity must return higher scores
-    similarity score
-
-    Args:
-        string1: any string
-        string2: any string
-    Returns:
-        similarity score
-
-    Sample Usage:
-    >>> calculate_string_similarity("exact match", "exact match")
-    1.0
-    >>> calculate_string_similarity("aaaaaa", "bbbbbbbbbbb")
-    0.0
-    >>> similar_score = calculate_string_similarity("very similar", "vary similar")
-    >>> different_score = calculate_string_similarity("very similar", "very not close")
-    >>> similar_score > different_score
-    True
-    """
-
-    return float(td.jaro_winkler(string1.lower()[::-1], string2.lower()[::-1]))
-
-
-def calculate_row_similarity(
-    row1: pd.DataFrame, row2: pd.DataFrame, weights: np.array, comparison_func
-) -> float:
-    """Find weighted similarity of two rows in a dataframe
-
-    The length of the weights vector must be the same as
-    the number of selected columns.
-
-    This version is slow and not optimized, and will be
-    revised in order to make it more efficient. It
-    exists as to provide basic functionality. Once we have
-    the comparison function locked in, using .apply will
-    likely be easier and more efficient.
-    """
-
-    row_length = len(weights)
-    if not (row1.shape[1] == row2.shape[1] == row_length):
-        raise ValueError("Number of columns and weights must be the same")
-
-    similarity = np.zeros(row_length)
-
-    for i in range(row_length):
-        similarity[i] = comparison_func(
-            row1.reset_index().drop(columns="index").iloc[:, i][0],
-            row2.reset_index().drop(columns="index").iloc[:, i][0],
-        )
-
-    return sum(similarity * weights)
-
-
-def row_matches(
-    df: pd.DataFrame, weights: np.array, threshold: float, comparison_func
-) -> dict:
-    """Get weighted similarity score of two rows
-
-    Run through the rows using indices: if two rows have a comparison score
-    greater than a threshold, we assign the later row to the former. Any
-    row which is matched to any other row is not examined again. Matches are
-    stored in a dictionary object, with each index appearing no more than once.
-
-    This is not optimized. Not presently sure how to make a good test case
-    for this, will submit and ask in mentor session.
-    """
-
-    all_indices = np.array(list(df.index))
-
-    index_dict = {}
-    [index_dict.setdefault(x, []) for x in all_indices]
-
-    discard_indices = []
-
-    end = max(all_indices)
-    for i in all_indices:
-        # Skip indices that have been stored in the discard_indices list
-        if i in discard_indices:
-            continue
-
-        # Iterate through the remaining numbers
-        for j in range(i + 1, end):
-            if j in discard_indices:
-                continue
-
-            # Our conditional
-            if (
-                calculate_row_similarity(
-                    df.iloc[[i]], df.iloc[[j]], weights, comparison_func
-                )
-                > threshold
-            ):
-                # Store the other index and mark it for skipping in future iterations
-                discard_indices.append(j)
-                index_dict[i].append(j)
-
-    return index_dict
-
-
-def match_confidence(
-    confidences: np.array(float), weights: np.array(float), weights_toggle: bool
-) -> float:
-    """Combine confidences for row matches into a final confidence
-
-    This is a weighted log-odds based combination of row match confidences
-    originating from various record linkage methods. Weights will be applied
-    to the linkage methods in order and must be of the same length.
-
-    weights_toggle allows one to turn weights on and off when calling the
-    function. False cancels the use of weights.
-
-    Since log-odds have undesirable behaviors at 0 and 1, we truncate at
-    +-5, which corresponds to around half a percent probability or
-    1 - the same.
-
-    >>> match_confidence(np.array([.6, .9, .0001]), np.array([2,5.7,8]), True)
-    2.627759082143462e-12
-    >>> match_confidence(np.array([.6, .9, .0001]), np.array([2,5.7,8]), False)
-    0.08337802853594725
-    """
-
-    if (min(confidences) < 0) or (max(confidences) > 1):
-        raise ValueError("Probabilities must be bounded on [0, 1]")
-
-    log_odds = []
-
-    for c in confidences:
-        l_o = np.log(c / (1 - c))
-
-        if l_o > 5:
-            l_o = 5
-
-        elif l_o < -5:
-            l_o = -5
-
-        log_odds.append(l_o)
-
-    if weights_toggle:
-        log_odds = log_odds * weights
-
-    l_o_sum = np.sum(log_odds)
-
-    conf_sum = math.e ** (l_o_sum) / (1 + math.e ** (l_o_sum))
-    return conf_sum
-
-
 def determine_comma_role(name: str) -> str:
     """Given a string (someone's name), attempts to determine the role of the
     comma in the name and where it ought to belong.
@@ -247,20 +87,7 @@ def determine_comma_role(name: str) -> str:
     >>> determine_comma_role("DOe, Jane")
     ' Jane Doe'
     """
-    suffixes = [
-        "sr",
-        "jr",
-        "i",
-        "ii",
-        "iii",
-        "iv",
-        "v",
-        "vi",
-        "vii",
-        "viii",
-        "ix",
-        "x",
-    ]
+
     name_parts = name.lower().split(",")
     # if the comma is just in the end as a typo:
     if len(name_parts[1]) == 0:
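Note: the hunk above swaps determine_comma_role's function-local suffixes list for the shared one added to utils/constants.py. A hypothetical sketch of the kind of check that list supports; the helper name is made up, and the real function handles more branches than this:

```python
from utils.constants import suffixes  # the list added to constants.py above

def comma_text_is_suffix(name: str) -> bool:
    """Return True when the text after the comma is a generational suffix.

    In that case ("Doe, Jr") the comma does not separate last name from
    first name, so the name should not be reordered. Assumes the input
    contains exactly one comma, as in determine_comma_role's doctests.
    """
    after_comma = name.lower().split(",")[1].strip()
    return after_comma in suffixes

# comma_text_is_suffix("Doe, Jr")   -> True  (suffix: keep word order)
# comma_text_is_suffix("Doe, Jane") -> False (last-first: reorder)
```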
@@ -326,18 +153,6 @@ def get_likely_name(first_name: str, last_name: str, full_name: str) -> str:
 
     # some names have titles or professions associated with the name. We need to
     # remove those from the name.
-    titles = [
-        "mr",
-        "ms",
-        "mrs",
-        "miss",
-        "prof",
-        "dr",
-        "doctor",
-        "sir",
-        "madam",
-        "professor",
-    ]
     names = [first_name, last_name, full_name]
 
     for i in range(len(names)):
@@ -363,10 +178,14 @@
 def get_street_from_address_line_1(address_line_1: str) -> str:
     """Given an address line 1, return the street name
 
+    Uses the usaddress library, which splits an address string into
+    components and labels each component.
+    https://usaddress.readthedocs.io/en/latest/
+
     Args:
-        address_line_1: either street information or PO box
+        address_line_1: either street information or PO box as a string
     Returns:
-        street name
+        street name as a string
     Raises:
         ValueError: if string is malformed and no street can be reasonably found.
@@ -405,54 +224,6 @@ def get_street_from_address_line_1(address_line_1: str) -> str:
     return " ".join(string)
 
 
-def name_rank(first_name: str, last_name: str) -> list:
-    """Returns a score for the rank of a given first name and last name
-    https://github.com/philipperemy/name-dataset
-    Args:
-        first_name: any string
-        last_name: any string
-    Returns:
-        name rank for first name and last names
-        1 is the most common name, only for names in the United States
-        First element in the list corresponds to the rank of the first name
-        Second element in the list corresponds to the rank of the last name
-        Empty or non string values will return None
-        Names that are not found in the dataset will return 0
-
-    >>> name_rank("John", "Smith")
-    [5, 7]
-    >>> name_rank("Adil", "Kassim")
-    [0, 7392]
-    >>> name_rank(None, 9)
-    [None, None]
-    """
-
-    # Initialize the NameDataset class
-    nd = NameDataset()
-
-    first_name_rank = 0
-    last_name_rank = 0
-    if isinstance(first_name, str):
-        first_name_result = nd.search(first_name)
-        if first_name_result and isinstance(first_name_result, dict):
-            first_name_data = first_name_result.get("first_name")
-            if first_name_data and "rank" in first_name_data:
-                first_name_rank = first_name_data["rank"].get(
-                    "United States", 0
-                )
-    else:
-        first_name_rank = None
-    if isinstance(last_name, str):
-        last_name_result = nd.search(last_name)
-        if last_name_result and isinstance(last_name_result, dict):
-            last_name_data = last_name_result.get("last_name")
-            if last_name_data and "rank" in last_name_data:
-                last_name_rank = last_name_data["rank"].get("United States", 0)
-    else:
-        last_name_rank = None
-    return [first_name_rank, last_name_rank]
-
-
 def convert_duplicates_to_dict(df: pd.DataFrame) -> None:
     """For each uuid, maps it to all other uuids for which it has been deemed a
     match.
@@ -537,6 +308,7 @@ def cleaning_company_column(company_entry: str) -> str:
         standardized for retired, self employed, and unemployed,
         or original string if no match or empty string
 
+    Sample Usage:
     >>> cleaning_company_column("Retireed")
     'Retired'
     >>> cleaning_company_column("self")
     'Self Employed'
@@ -592,6 +364,7 @@
 
     Returns:
         standardized company name
 
+    Sample Usage:
     >>> standardize_corp_names('MI BEER WINE WHOLESALERS ASSOC')
     'MI BEER WINE WHOLESALERS ASSOCIATION'
@@ -617,6 +390,10 @@
 def get_address_number_from_address_line_1(address_line_1: str) -> str:
     """Given an address line 1, return the building number or po box
 
+    Uses the usaddress library, which splits an address string into
+    components and labels each component.
+    https://usaddress.readthedocs.io/en/latest/
+
     Args:
         address_line_1: either street information or PO box
     Returns:
@@ -655,6 +432,11 @@
     individuals_settings, indivduals_blocking,
     organizations_settings, organizations_blocking
 
+    Uses the splink library, which employs probabilistic matching for
+    record linkage.
+    https://moj-analytical-services.github.io/splink/index.html
+
+
     Args:
         df: dataframe
         settings: configuration settings
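Note: splink_dedupe's docstring now points at splink's probabilistic matching. A minimal sketch of the dedupe flow it wraps, assuming splink 3.x with the DuckDB backend shown in the imports; the columns, blocking rule, and thresholds below are illustrative stand-ins for the real individuals_settings / organizations_settings kept in constants.py:

```python
import pandas as pd
import splink.duckdb.comparison_library as cl
from splink.duckdb.linker import DuckDBLinker

# Illustrative records; splink expects a unique_id column by default.
df = pd.DataFrame(
    {
        "unique_id": [1, 2, 3],
        "first_name": ["jane", "jane", "bob"],
        "last_name": ["doe", "does", "smith"],
    }
)

settings = {
    "link_type": "dedupe_only",
    # Only rows agreeing on first_name are compared, which keeps the
    # candidate pair count manageable on large tables.
    "blocking_rules_to_generate_predictions": ["l.first_name = r.first_name"],
    "comparisons": [
        cl.exact_match("first_name"),
        cl.levenshtein_at_thresholds("last_name", 2),
    ],
}

linker = DuckDBLinker(df, settings)
# Estimate u probabilities from random record pairs (older splink 3
# releases name this argument target_rows rather than max_pairs).
linker.estimate_u_using_random_sampling(max_pairs=1e6)
pairs = linker.predict(threshold_match_probability=0.7)
print(pairs.as_pandas_dataframe())
```

In splink 3.x, grouping the scored pairs into entity clusters is one more call, e.g. `linker.cluster_pairwise_predictions_at_threshold(pairs, 0.95)`.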
diff --git a/utils/tests/test_linkage.py b/utils/tests/test_linkage.py
index 4a5f73f..d96339d 100644
--- a/utils/tests/test_linkage.py
+++ b/utils/tests/test_linkage.py
@@ -1,115 +1,13 @@
-import numpy as np
 import pandas as pd
 import pytest
 
 from utils.constants import BASE_FILEPATH
-from utils.linkage import (
-    calculate_row_similarity,
-    calculate_string_similarity,
-    deduplicate_perfect_matches,
-    row_matches,
-)
+from utils.linkage import deduplicate_perfect_matches
 
 """
 Module for testing functions in linkage.py
 """
 
-# Creating a test for calculate_row_similarity and row_matches
-
-# to put in data:
-d = {
-    "name": ["bob von rosevich", "anantarya smith", "bob j vonrosevich"],
-    "address": [
-        "3 Blue Drive, Chicago",
-        "4 Blue Drive, Chicago",
-        "8 Fancy Way, Chicago",
-    ],
-}
-
-test_df = pd.DataFrame(data=d)
-
-
-@pytest.fixture
-def row_similarity_scen_1():
-    return test_df
-
-
-@pytest.fixture
-def row_similarity_scen_2():
-    return test_df
-
-
-def test_row_similarity_scen_1(row_similarity_scen_1):
-    wrong = calculate_row_similarity(
-        row_similarity_scen_1.iloc[[0]],
-        row_similarity_scen_1.iloc[[1]],
-        np.array([0.8, 0.2]),
-        calculate_string_similarity,
-    )
-    right = calculate_row_similarity(
-        row_similarity_scen_1.iloc[[0]],
-        row_similarity_scen_1.iloc[[2]],
-        np.array([0.8, 0.2]),
-        calculate_string_similarity,
-    )
-
-    assert right > wrong
-
-
-def test_row_similarity_scen_2(row_similarity_scen_2):
-    wrong = calculate_row_similarity(
-        row_similarity_scen_2.iloc[[0]],
-        row_similarity_scen_2.iloc[[1]],
-        np.array([0.2, 0.8]),
-        calculate_string_similarity,
-    )
-    right = calculate_row_similarity(
-        row_similarity_scen_2.iloc[[0]],
-        row_similarity_scen_2.iloc[[2]],
-        np.array([0.2, 0.8]),
-        calculate_string_similarity,
-    )
-
-    assert right < wrong
-
-
-d2 = {
-    "name": [
-        "bob von rosevich",
-        "anantarya smith",
-        "bob j vonrosevich",
-        "missy elliot",
-        "mr johnson",
-        "quarantin directino",
-        "missy eliot",
-        "joseph johnson",
-    ],
-    "address": [
-        "3 Blue Drive, Chicago",
-        "4 Blue Drive, Chicago",
-        "8 Fancy Way, Chicago",
-        "8 Fancy Way, Evanston",
-        "17 Regular Road, Chicago",
-        "42 Hollywood Boulevard, Chicago",
-        "8 Fancy Way, Evanston",
-        "17 Regular Road, Chicago",
-    ],
-}
-test_df2 = pd.DataFrame(data=d2)
-
-
-@pytest.fixture
-def row_match_scen1():
-    return test_df2
-
-
-def test_row_matches(row_match_scen1):
-    res = row_matches(
-        row_match_scen1, np.array([0.8, 0.2]), 0.9, calculate_string_similarity
-    )
-
-    assert res == {0: [2], 1: [], 2: [], 3: [6], 4: [], 5: [], 6: [], 7: []}
-
-
 # Test for dedupe function
 @pytest.fixture