Merge pull request #36 from dsi-clinic/update_classify

averyschoen · web-flow · commit 863cfab9e76e · 2024-03-04T20:20:58.000-06:00
Update classify
diff --git a/utils/classify.py b/utils/classify.py
@@ -3,17 +3,27 @@
 from utils.constants import c_org_names, f_companies, f_org_names
 
 
-def classify_wrapper(individuals_df, organizations_df):
-    """Wrapper for classificaiton in linkage pipeline
+def classify_wrapper(
+    individuals_df: pd.DataFrame, organizations_df: pd.DataFrame
+):
+    """Wrapper for classification in linkage pipeline
 
     Initialize the classify column in both dataframes and
     call sub-functions classifying individuals and organizations
 
-    Args: individuals_df: cleaned and deduplicated dataframe of individuals
-    organizations_df: cleaned and deduplicated dataframe of organizations
+    Args:
+        individuals_df: cleaned and deduplicated dataframe of individuals
+        organizations_df: cleaned and deduplicated dataframe of organizations
+
+    Returns:
+        individuals and organizations dataframes with a new
+        'classification' column containing 'neutral', 'f', or 'c'.
+        'neutral' status is the default for all entities, and those tagged
+        as 'neutral' are entities which we could not confidently identify as
+        either fossil fuel or clean energy organizations or affiliates.
+        Classification is very conservative, so we are highly confident that
+        entities tagged 'f' or 'c' really do belong to that group.
 
-    Returns: individuals and organizations datfarames with a new
-    'classification' column containing 'neutral', 'f', or 'c'
     """
 
     individuals_df["classification"] = "neutral"
@@ -25,14 +35,24 @@ def classify_wrapper(individuals_df, organizations_df):
     return classified_individuals, classified_orgs
 
 
-def matcher(df, substring, column, category):
+def matcher(df: pd.DataFrame, substring: str, column: str, category: str):
     """Applies a label to the classification column based on substrings
 
     We run through a given column containing strings in the dataframe. We
     seek out rows containing substrings, and apply a certain label to
     the classification column. We initialize using the 'neutral' label and
     use the 'f' and 'c' labels to denote fossil fuel and clean energy
     entities respectively.
+
+    Args:
+        df: a pandas dataframe
+        substring: the string to search for
+        column: the column name in which to search
+        category: the category to assign the row, e.g. 'f', 'c' or 'neutral'
+
+    Returns:
+        A pandas dataframe in which rows matching the substring conditions in
+        a certain column are marked with the appropriate category
     """
 
     bool_series = df[column].str.contains(substring, na=False)
@@ -42,12 +62,18 @@ def matcher(df, substring, column, category):
     return df
 
 
-def classify_individuals(individuals_df):
+def classify_individuals(individuals_df: pd.DataFrame):
     """Part of the classification pipeline
 
-    We apply the matcher function to the individuals dataframe
-    repeatedly, using a variety of substrings to identify the
-    employees of fossil fuel companies.
+    We check if individuals work for a known fossil fuel company
+    and categorize them using the matcher() function.
+
+    Args:
+        individuals_df: a dataframe containing deduplicated
+        standardized individuals data
+
+    Returns:
+        an individuals dataframe updated with the fossil fuels category
     """
 
     for i in f_companies:
@@ -56,12 +82,20 @@ def classify_individuals(individuals_df):
     return individuals_df
 
 
-def classify_orgs(organizations_df):
+def classify_orgs(organizations_df: pd.DataFrame):
     """Part of the classification pipeline
 
     We apply the matcher function to the organizations dataframe
     repeatedly, using a variety of substrings to identify fossil
     fuel and clean energy companies.
+
+    Args:
+        organizations_df: a dataframe containing deduplicated
+        standardized organizations data
+
+    Returns:
+        an organizations dataframe updated with the fossil fuels
+        and clean energy category
     """
 
     for i in f_org_names:
@@ -71,69 +105,3 @@ def classify_orgs(organizations_df):
         organizations_df = matcher(organizations_df, i, "name", "c")
 
     return organizations_df
-
-
-inds_list = []
-
-# a list of individual names
-
-
-def similarity_calculator(
-    df: pd.DataFrame, subject: str, n: int, comparison_func
-) -> pd.DataFrame:
-    """Find best matches to a subject name in a pandas dataframe
-
-    For a given individual or organization, the subject, we search through the
-    'name'column of a dataframe, select the n highest matches according to a
-    selected comparison function, and return those as a dataframe. This is meant
-    to be used manually to search for matches. For quick automated processing, see
-    automated_classifier().
-
-    Note that the comparison function must take in two inputs, both strings, and
-    output a percentage match
-    """
-
-    similarities_df = df.copy()
-
-    similarities = similarities_df["name"].apply(
-        lambda x: comparison_func(x, subject)
-    )
-
-    similarities_df["similarities"] = similarities
-
-    top_n_matches = similarities_df.sort_values(
-        by=["similarities"], ascending=False
-    )[0:n]
-
-    return top_n_matches
-
-
-def automated_classifier(
-    df: pd.DataFrame, subjects_dict: dict, threshold: float, comparison_func
-):
-    """Using similarity_calculator, classify entities automatically
-
-    Feeding a dictionary of names and the associated statuses, we compare
-    the string matches and, if they exceed a certain threshold, classify
-    them as belonging to some group specified in the subjects dictionary.
-    """
-
-    similarities_df = df.copy()
-
-    for subject in subjects_dict:
-        similarities = similarities_df["name"].apply(
-            lambda x, sub=subject: comparison_func(x, sub)
-        )
-        matches = similarities >= threshold
-
-        status = subjects_dict[subject]
-
-        similarities_df["classification"] = pd.Series(matches).apply(
-            lambda x, stat=status: stat if x else "neutral"
-        )
-
-    return similarities_df
-
-    # we can use the indices and/or select manually, just add a new
-    # column to the subjects table
-    # that marks fossil fuels, green energy, or neither
diff --git a/utils/tests/test_classify.py b/utils/tests/test_classify.py
@@ -0,0 +1,46 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from utils.classify import matcher
+
+d = {
+    "name": [
+        "bob von rosevich",
+        "anantarya smith",
+        "bob j vonrosevich",
+        "missy elliot",
+        "mr johnson",
+        "quarantin directino",
+        "missy eliot",
+        "joseph johnson",
+    ],
+    "address": [
+        "3 Blue Drive, Chicago",
+        "4 Blue Drive, Chicago",
+        "8 Fancy Way, Chicago",
+        "8 Fancy Way, Evanston",
+        "17 Regular Road, Chicago",
+        "42 Hollywood Boulevard, Chicago",
+        "8 Fancy Way, Evanston",
+        "17 Regular Road, Chicago",
+    ],
+}
+
+test_df = pd.DataFrame(data=d)
+
+test_df["classification"] = "neutral"
+
+
+@pytest.fixture
+def matcher_scen_1():
+    return test_df
+
+
+def test_matcher_scen_1(matcher_scen_1):
+    matcher(matcher_scen_1, "Fancy", "address", "f")
+    res = test_df[test_df["classification"] == "f"]["name"].values
+
+    assert np.all(
+        res == np.array(["bob j vonrosevich", "missy elliot", "missy eliot"])
+    )