Skip to content

Commit 863cfab

Browse files
authored
Merge pull request #36 from dsi-clinic/update_classify
Update classify
2 parents ecd73d0 + feda102 commit 863cfab

File tree

2 files changed

+92
-78
lines changed

2 files changed

+92
-78
lines changed

utils/classify.py

+46-78
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,27 @@
33
from utils.constants import c_org_names, f_companies, f_org_names
44

55

6-
def classify_wrapper(individuals_df, organizations_df):
7-
"""Wrapper for classificaiton in linkage pipeline
6+
def classify_wrapper(
7+
individuals_df: pd.DataFrame, organizations_df: pd.DataFrame
8+
):
9+
"""Wrapper for classification in linkage pipeline
810
911
Initialize the classify column in both dataframes and
1012
call sub-functions classifying individuals and organizations
1113
12-
Args: individuals_df: cleaned and deduplicated dataframe of individuals
13-
organizations_df: cleaned and deduplicated dataframe of organizations
14+
Args:
15+
individuals_df: cleaned and deduplicated dataframe of individuals
16+
organizations_df: cleaned and deduplicated dataframe of organizations
17+
18+
Returns:
19+
individuals and organizations dataframes with a new
20+
'classification' column containing 'neutral', 'f', or 'c'.
21+
'neutral' status is the default for all entities, and those tagged
22+
as 'neutral' are entities which we could not confidently identify as
23+
either fossil fuel or clean energy organizations or affiliates.
24+
Classification is very conservative, and we are very confident that
25+
entities classified as one group or another are related to them.
1426
15-
Returns: individuals and organizations datfarames with a new
16-
'classification' column containing 'neutral', 'f', or 'c'
1727
"""
1828

1929
individuals_df["classification"] = "neutral"
@@ -25,14 +35,24 @@ def classify_wrapper(individuals_df, organizations_df):
2535
return classified_individuals, classified_orgs
2636

2737

28-
def matcher(df, substring, column, category):
38+
def matcher(df: pd.DataFrame, substring: str, column: str, category: str):
2939
"""Applies a label to the classification column based on substrings
3040
3141
We run through a given column containing strings in the dataframe. We
3242
seek out rows containing substrings, and apply a certain label to
3343
the classification column. We initialize using the 'neutral' label and
3444
use the 'f' and 'c' labels to denote fossil fuel and clean energy
3545
entities respectively.
46+
47+
Args:
48+
df: a pandas dataframe
49+
substring: the string to search for
50+
column: the column name in which to search
51+
category: the category to assign the row, such as 'f', 'c', or 'neutral'
52+
53+
Returns:
54+
A pandas dataframe in which rows matching the substring conditions in
55+
a certain column are marked with the appropriate category
3656
"""
3757

3858
bool_series = df[column].str.contains(substring, na=False)
@@ -42,12 +62,18 @@ def matcher(df, substring, column, category):
4262
return df
4363

4464

45-
def classify_individuals(individuals_df):
65+
def classify_individuals(individuals_df: pd.DataFrame):
4666
"""Part of the classification pipeline
4767
48-
We apply the matcher function to the individuals dataframe
49-
repeatedly, using a variety of substrings to identify the
50-
employees of fossil fuel companies.
68+
We check if individuals work for a known fossil fuel company
69+
and categorize them using the matcher() function.
70+
71+
Args:
72+
individuals_df: a dataframe containing deduplicated
73+
standardized individuals data
74+
75+
Returns:
76+
an individuals dataframe updated with the fossil fuels category
5177
"""
5278

5379
for i in f_companies:
@@ -56,12 +82,20 @@ def classify_individuals(individuals_df):
5682
return individuals_df
5783

5884

59-
def classify_orgs(organizations_df):
85+
def classify_orgs(organizations_df: pd.DataFrame):
6086
"""Part of the classification pipeline
6187
6288
We apply the matcher function to the organizations dataframe
6389
repeatedly, using a variety of substrings to identify fossil
6490
fuel and clean energy companies.
91+
92+
Args:
93+
organizations_df: a dataframe containing deduplicated
94+
standardized organizations data
95+
96+
Returns:
97+
an organizations dataframe updated with the fossil fuels
98+
and clean energy category
6599
"""
66100

67101
for i in f_org_names:
@@ -71,69 +105,3 @@ def classify_orgs(organizations_df):
71105
organizations_df = matcher(organizations_df, i, "name", "c")
72106

73107
return organizations_df
74-
75-
76-
inds_list = []
77-
78-
# a list of individual names
79-
80-
81-
def similarity_calculator(
    df: pd.DataFrame, subject: str, n: int, comparison_func
) -> pd.DataFrame:
    """Find best matches to a subject name in a pandas dataframe

    For a given individual or organization, the subject, we score every
    entry in the 'name' column of the dataframe against it with the
    selected comparison function and return the n highest-scoring rows.
    This is meant to be used manually to search for matches. For quick
    automated processing, see automated_classifier().

    Args:
        df: dataframe with a 'name' column; it is not modified
        subject: the name every row is compared against
        n: maximum number of rows to return; must not be negative
        comparison_func: callable taking two strings (the row's name and
            the subject, in that order) and returning a percentage match

    Returns:
        A copy of at most n rows of df sorted by descending score, with
        the scores stored in a new 'similarities' column. Rows with equal
        scores keep their original relative order.

    Raises:
        ValueError: if n is negative
    """
    # A negative n would silently slice rows off the *end* of the ranking
    # rather than limit it, so reject it explicitly.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    similarities_df = df.copy()
    similarities_df["similarities"] = similarities_df["name"].apply(
        lambda x: comparison_func(x, subject)
    )

    # mergesort is stable: tied scores come back in a deterministic order
    # instead of depending on the default quicksort's internals.
    top_n_matches = similarities_df.sort_values(
        by=["similarities"], ascending=False, kind="mergesort"
    ).head(n)

    return top_n_matches
109-
110-
111-
def automated_classifier(
    df: pd.DataFrame, subjects_dict: dict, threshold: float, comparison_func
):
    """Classify entities automatically by name similarity

    Feeding a dictionary of names and the associated statuses, we compare
    every entry of the 'name' column against each subject and, where the
    match meets or exceeds the threshold, classify the row as belonging to
    the group specified for that subject in the subjects dictionary.

    Args:
        df: dataframe with a 'name' column; it is not modified
        subjects_dict: maps each subject name to the status to assign to
            rows that match it
        threshold: minimum score (inclusive) for a row to count as a match
        comparison_func: callable taking two strings (the row's name and
            the subject, in that order) and returning a comparable score

    Returns:
        A copy of df with a 'classification' column. Rows matching no
        subject are 'neutral'; if a row matches several subjects, the one
        that comes last in subjects_dict wins.
    """
    similarities_df = df.copy()

    # Initialise once. Re-deriving the whole column inside the loop would
    # reset the matches of every earlier subject back to 'neutral'.
    similarities_df["classification"] = "neutral"

    for subject, status in subjects_dict.items():
        similarities = similarities_df["name"].apply(
            lambda x, sub=subject: comparison_func(x, sub)
        )
        matches = similarities >= threshold

        # Only touch the rows that matched this subject.
        similarities_df.loc[matches, "classification"] = status

    return similarities_df
136-
137-
# we can use the indices and/or select manually, just add a new
138-
# column to the subjects table
139-
# that marks fossil fuels, green energy, or neither

utils/tests/test_classify.py

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import numpy as np
2+
import pandas as pd
3+
import pytest
4+
5+
from utils.classify import matcher
6+
7+
# Sample people and addresses shared by the matcher tests. Treat this as
# read-only: tests receive their own copy through the fixture below.
d = {
    "name": [
        "bob von rosevich",
        "anantarya smith",
        "bob j vonrosevich",
        "missy elliot",
        "mr johnson",
        "quarantin directino",
        "missy eliot",
        "joseph johnson",
    ],
    "address": [
        "3 Blue Drive, Chicago",
        "4 Blue Drive, Chicago",
        "8 Fancy Way, Chicago",
        "8 Fancy Way, Evanston",
        "17 Regular Road, Chicago",
        "42 Hollywood Boulevard, Chicago",
        "8 Fancy Way, Evanston",
        "17 Regular Road, Chicago",
    ],
}

test_df = pd.DataFrame(data=d)

# Every row starts out with the default label.
test_df["classification"] = "neutral"


@pytest.fixture
def matcher_scen_1():
    """Return a fresh copy of the sample frame for each test

    Handing out the module-level frame itself would let one test's
    labels leak into every test that runs after it.
    """
    return test_df.copy()


def test_matcher_scen_1(matcher_scen_1):
    """Rows whose address contains 'Fancy' are labelled 'f'"""
    # Assert on the frame matcher returns rather than on module state, so
    # the test does not depend on whether matcher labels rows in place.
    classified = matcher(matcher_scen_1, "Fancy", "address", "f")
    res = classified[classified["classification"] == "f"]["name"].values

    expected = np.array(["bob j vonrosevich", "missy elliot", "missy eliot"])

    # array_equal is False on a length mismatch, whereas an elementwise
    # `==` between arrays of different lengths is not a reliable check.
    assert np.array_equal(res, expected)

0 commit comments

Comments
 (0)