3
3
from utils .constants import c_org_names , f_companies , f_org_names
4
4
5
5
6
- def classify_wrapper (individuals_df , organizations_df ):
7
- """Wrapper for classificaiton in linkage pipeline
6
+ def classify_wrapper (
7
+ individuals_df : pd .DataFrame , organizations_df : pd .DataFrame
8
+ ):
9
+ """Wrapper for classification in linkage pipeline
8
10
9
11
Initialize the classify column in both dataframes and
10
12
call sub-functions classifying individuals and organizations
11
13
12
- Args: individuals_df: cleaned and deduplicated dataframe of individuals
13
- organizations_df: cleaned and deduplicated dataframe of organizations
14
+ Args:
15
+ individuals_df: cleaned and deduplicated dataframe of individuals
16
+ organizations_df: cleaned and deduplicated dataframe of organizations
17
+
18
+ Returns:
19
+ individuals and organizations dataframes with a new
20
+ 'classification' column containing 'neutral', 'f', or 'c'.
21
+ 'neutral' status is the default for all entities, and those tagged
22
+ as 'neutral' are entities which we could not confidently identify as
23
+ either fossil fuel or clean energy organizations or affiliates.
24
+ Classification is very conservative, and we are very confident that
25
+ entities classified into one group or the other are in fact related to it.
14
26
15
- Returns: individuals and organizations datfarames with a new
16
- 'classification' column containing 'neutral', 'f', or 'c'
17
27
"""
18
28
19
29
individuals_df ["classification" ] = "neutral"
@@ -25,14 +35,24 @@ def classify_wrapper(individuals_df, organizations_df):
25
35
return classified_individuals , classified_orgs
26
36
27
37
28
- def matcher (df , substring , column , category ):
38
+ def matcher (df : pd . DataFrame , substring : str , column : str , category : str ):
29
39
"""Applies a label to the classification column based on substrings
30
40
31
41
We run through a given column containing strings in the dataframe. We
32
42
seek out rows containing substrings, and apply a certain label to
33
43
the classification column. We initialize using the 'neutral' label and
34
44
use the 'f' and 'c' labels to denote fossil fuel and clean energy
35
45
entities respectively.
46
+
47
+ Args:
48
+ df: a pandas dataframe
49
+ substring: the string to search for
50
+ column: the column name in which to search
51
+ category: the category to assign the row, such as 'f', 'c', or 'neutral'
52
+
53
+ Returns:
54
+ A pandas dataframe in which rows matching the substring conditions in
55
+ a certain column are marked with the appropriate category
36
56
"""
37
57
38
58
bool_series = df [column ].str .contains (substring , na = False )
@@ -42,12 +62,18 @@ def matcher(df, substring, column, category):
42
62
return df
43
63
44
64
45
- def classify_individuals (individuals_df ):
65
+ def classify_individuals (individuals_df : pd . DataFrame ):
46
66
"""Part of the classification pipeline
47
67
48
- We apply the matcher function to the individuals dataframe
49
- repeatedly, using a variety of substrings to identify the
50
- employees of fossil fuel companies.
68
+ We check if individuals work for a known fossil fuel company
69
+ and categorize them using the matcher() function.
70
+
71
+ Args:
72
+ individuals_df: a dataframe containing deduplicated
73
+ standardized individuals data
74
+
75
+ Returns:
76
+ an individuals dataframe updated with the fossil fuels category
51
77
"""
52
78
53
79
for i in f_companies :
@@ -56,12 +82,20 @@ def classify_individuals(individuals_df):
56
82
return individuals_df
57
83
58
84
59
- def classify_orgs (organizations_df ):
85
+ def classify_orgs (organizations_df : pd . DataFrame ):
60
86
"""Part of the classification pipeline
61
87
62
88
We apply the matcher function to the organizations dataframe
63
89
repeatedly, using a variety of substrings to identify fossil
64
90
fuel and clean energy companies.
91
+
92
+ Args:
93
+ organizations_df: a dataframe containing deduplicated
94
+ standardized organizations data
95
+
96
+ Returns:
97
+ an organizations dataframe updated with the fossil fuels
98
+ and clean energy category
65
99
"""
66
100
67
101
for i in f_org_names :
@@ -71,69 +105,3 @@ def classify_orgs(organizations_df):
71
105
organizations_df = matcher (organizations_df , i , "name" , "c" )
72
106
73
107
return organizations_df
74
-
75
-
76
- inds_list = []
77
-
78
- # a list of individual names
79
-
80
-
81
- def similarity_calculator (
82
- df : pd .DataFrame , subject : str , n : int , comparison_func
83
- ) -> pd .DataFrame :
84
- """Find best matches to a subject name in a pandas dataframe
85
-
86
- For a given individual or organization, the subject, we search through the
87
- 'name'column of a dataframe, select the n highest matches according to a
88
- selected comparison function, and return those as a dataframe. This is meant
89
- to be used manually to search for matches. For quick automated processing, see
90
- automated_classifier().
91
-
92
- Note that the comparison function must take in two inputs, both strings, and
93
- output a percentage match
94
- """
95
-
96
- similarities_df = df .copy ()
97
-
98
- similarities = similarities_df ["name" ].apply (
99
- lambda x : comparison_func (x , subject )
100
- )
101
-
102
- similarities_df ["similarities" ] = similarities
103
-
104
- top_n_matches = similarities_df .sort_values (
105
- by = ["similarities" ], ascending = False
106
- )[0 :n ]
107
-
108
- return top_n_matches
109
-
110
-
111
- def automated_classifier (
112
- df : pd .DataFrame , subjects_dict : dict , threshold : float , comparison_func
113
- ):
114
- """Using similarity_calculator, classify entities automatically
115
-
116
- Feeding a dictionary of names and the associated statuses, we compare
117
- the string matches and, if they exceed a certain threshold, classify
118
- them as belonging to some group specified in the subjects dictionary.
119
- """
120
-
121
- similarities_df = df .copy ()
122
-
123
- for subject in subjects_dict :
124
- similarities = similarities_df ["name" ].apply (
125
- lambda x , sub = subject : comparison_func (x , sub )
126
- )
127
- matches = similarities >= threshold
128
-
129
- status = subjects_dict [subject ]
130
-
131
- similarities_df ["classification" ] = pd .Series (matches ).apply (
132
- lambda x , stat = status : stat if x else "neutral"
133
- )
134
-
135
- return similarities_df
136
-
137
- # we can use the indices and/or select manually, just add a new
138
- # column to the subjects table
139
- # that marks fossil fuels, green energy, or neither
0 commit comments