Skip to content

Commit 3c2005f

Browse files
authored
Merge pull request #29 from dsi-clinic/networkx_record_linkage
update on function to add nodes and their attributes to graph
2 parents 863cfab + 0f7d07e commit 3c2005f

File tree

4 files changed

+233
-3
lines changed

4 files changed

+233
-3
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,4 @@ run-notebooks:
3434
#still waiting on linkage_pipeline completion to get this into final shape
3535

3636
output network_graph: all_individuals.csv all_organizations.csv all_transactions.csv
37-
python linkage_pipeline.py
37+
python linkage_pipeline.py

README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ If you prefer to develop inside a container with VS Code then do the following s
4545

4646
### Record Linkage and Network Pipeline
4747
1. Save the standardized tables "complete_individuals_table.csv", "complete_organizations_table.csv", and "complete_transactions_table.csv" (collected from the above pipeline or data from the project's Google Drive) in the following format: repo_root / "output" / "file"
48-
2. **UPDATE:** Run the pipeline by calling ```make run-linkage-pipeline```. This pipeline will perform conservative record linkage, attempt to classify entities as neutral, fossil fuels, or clean energy, and an interactive network visual
49-
3. The pipeline will output the deduplicated tables saved as "cleaned_individuals_table.csv", "cleaned_organizations_table.csv", and "cleaned_transactions_table.csv". A mapping file, "deduplicated_UUIDs" tracks the UUIDs designated as duplicates.
48+
2. **UPDATE:** Run the pipeline by calling ```make run-linkage-pipeline```. This pipeline will perform conservative record linkage, attempt to classify entities as neutral, fossil fuels, or clean energy, convert the standardized tables into a NetworkX Graph, and show an interactive network visual.
49+
3. The pipeline will output the deduplicated tables saved as "cleaned_individuals_table.csv", "cleaned_organizations_table.csv", and "cleaned_transactions_table.csv". A mapping file, "deduplicated_UUIDs" tracks the UUIDs designated as duplicates. The pipeline will also output "Network Graph Node Data", which is the NetworkX Graph object converted into an adjacency list.
5050

5151
## Repository Structure
5252

output/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
# Output README
22
---
3+
'deduplicated_UUIDs.csv' : Following record linkage work in the record_linkage pipeline, this file stores all the original UUIDs and indicates the UUID to which each deduplicated UUID has been matched.

utils/network.py

+229
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
import networkx as nx
2+
import pandas as pd
3+
import plotly.graph_objects as go
4+
5+
6+
def name_identifier(uuid: str, dfs: list[pd.DataFrame]) -> str:
    """Look up an entity's display name from its uuid.

    The dataframes are searched in the order given. Within each dataframe
    the 'name' column is consulted before the 'full_name' column, and the
    first row whose 'id' equals the uuid supplies the result.

    Args:
        uuid: the uuid of the entity
        dfs: dataframes that have an 'id' column and a 'name' or
            'full_name' column
    Return:
        The entity's name, or None when no dataframe contains the uuid
    """
    for table in dfs:
        for name_column in ("name", "full_name"):
            if name_column not in table.columns:
                continue
            matches = table.loc[table["id"] == uuid]
            if len(matches) > 0:
                return matches.iloc[0][name_column]
    return None
27+
28+
29+
def combine_datasets_for_network_graph(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """Combine the entity and transaction tables into one edge-level table.

    Each transaction is labelled with its recipient's name and joined to its
    donor (individual or organization). All transactions between the same
    donor/recipient pair are then collapsed into a single row.

    Args:
        dfs: dataframes in the order [inds_df, orgs_df, transactions_df].
            inds_df needs the columns 'id' and 'full_name', orgs_df needs
            'id' and 'name', and transactions_df needs 'donor_id',
            'recipient_id' and 'amount'.

    Returns:
        A dataframe with one row per (donor_id, recipient_id, full_name,
        recipient_name) group. 'amount' is summed over the group; every
        other column keeps its first value in the group. The entity 'id'
        column is dropped. The input dataframes are left unmodified.
    """
    inds_df, orgs_df, transactions_df = dfs
    key_cols = ["donor_id", "recipient_id", "full_name", "recipient_name"]

    # Work on a copy: adding a column in place would mutate the caller's
    # table and raises SettingWithCopyWarning when it is a filtered slice.
    transactions_df = transactions_df.copy()

    # Build one id -> name lookup instead of scanning both entity tables for
    # every transaction. Organizations are listed first and the first
    # occurrence of an id wins, the same precedence as
    # name_identifier(uuid, [orgs_df, inds_df]).
    name_lookup = pd.concat(
        [
            orgs_df.set_index("id")["name"],
            inds_df.set_index("id")["full_name"],
        ]
    )
    name_lookup = name_lookup[~name_lookup.index.duplicated(keep="first")]
    transactions_df["recipient_name"] = transactions_df["recipient_id"].map(
        name_lookup
    )

    # next, merge the inds_df and orgs_df ids with the transactions_df
    # donor_id; entities with no transaction have a null amount and are
    # dropped
    inds_trans_df = pd.merge(
        inds_df, transactions_df, how="left", left_on="id", right_on="donor_id"
    ).dropna(subset=["amount"])
    orgs_trans_df = pd.merge(
        orgs_df, transactions_df, how="left", left_on="id", right_on="donor_id"
    ).dropna(subset=["amount"])
    # organizations share the donor name column used for individuals
    orgs_trans_df = orgs_trans_df.rename(columns={"name": "full_name"})

    merged_df = pd.concat([orgs_trans_df, inds_trans_df])

    # lastly, create the final dataframe with aggregated attributes
    attribute_cols = merged_df.columns.difference(key_cols)
    agg_functions = {
        col: "sum" if col == "amount" else "first" for col in attribute_cols
    }
    aggreg_df = merged_df.groupby(key_cols).agg(agg_functions).reset_index()
    return aggreg_df.drop(columns=["id"])
84+
85+
86+
def create_network_graph(df: pd.DataFrame) -> nx.MultiDiGraph:
    """Build a MultiDiGraph of contributions from the merged dataframe.

    Each row becomes one directed edge from the donor ('full_name') to the
    recipient ('recipient_name'). The columns listed in edge_columns are
    stored as edge attributes; every other non-null column of the row is
    stored as an attribute of the donor node.

    Args:
        df: a pandas dataframe with merged information from the inds, orgs, &
            transactions dataframes. It must contain 'full_name',
            'recipient_name' and every column named in edge_columns.

    Returns:
        A Networkx MultiDiGraph with nodes and edges
    """
    G = nx.MultiDiGraph()
    edge_columns = [
        "office_sought",
        "purpose",
        "transaction_type",
        "year",
        "transaction_id",
        "donor_office",
        "amount",
    ]
    # Loop-invariant: every column that is not an edge attribute describes
    # the donor node.
    node_columns = df.columns.difference(edge_columns)

    for _, row in df.iterrows():
        donor = row["full_name"]
        recipient = row["recipient_name"]

        # add node attributes based on the columns relevant to the entity
        G.add_node(donor, **row[node_columns].dropna().to_dict())

        # add the recipient as a node. add_node updates the attributes of an
        # existing node, so only fall back to "neutral" when the node has no
        # classification yet; otherwise an entity already added as a donor
        # would lose its real classification.
        G.add_node(recipient)
        G.nodes[recipient].setdefault("classification", "neutral")

        # add the edge attributes between two nodes
        edge_attributes = row[edge_columns].dropna().to_dict()
        G.add_edge(donor, recipient, **edge_attributes)

    return G
122+
123+
124+
def plot_network_graph(G: nx.MultiDiGraph):
    """Given a networkX Graph, creates a plotly visualization of the nodes and
    edges

    Node positions come from networkx's spring layout. Edges are drawn as grey
    line segments with arrow markers and show their 'amount' on hover. Nodes
    show their name and attributes on hover and are colored by their
    'classification' attribute: "c" is blue, "f" is red, and anything else
    (including a missing classification) is green.

    Args:
        G: a networkX MultiDiGraph with edges including the attribute
            'amount' (a KeyError is raised for an edge without it)

    Returns: None. Creates a plotly graph
    """
    pos = nx.spring_layout(G)

    # Collect coordinates in plain lists and build each trace once, rather
    # than growing the trace properties point by point.
    edge_x, edge_y, edge_hover = [], [], []
    for source, target, attrs in G.edges(data=True):
        label = f"Amount: {attrs['amount']:.2f}"
        # None creates a gap between line segments
        edge_x += [pos[source][0], pos[target][0], None]
        edge_y += [pos[source][1], pos[target][1], None]
        # Hover text is matched to points by position and every edge adds
        # three points, so the label is repeated to stay aligned with its
        # own edge.
        edge_hover += [label, label, label]

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        line=dict(color="#888", width=1.5),
        hoverinfo="text",
        hovertext=edge_hover,
        mode="lines+markers",
        # Define arrow symbol for edges
        marker=dict(
            symbol="arrow", color="#888", size=10, angleref="previous"
        ),
    )

    node_x, node_y, node_text, node_colors = [], [], [], []
    for node, attrs in G.nodes(data=True):
        details = "".join(f"{key}: {value}<br>" for key, value in attrs.items())
        node_text.append(f"Name: {node}<br>{details}")

        # Assign a color based on the classification value
        classification = attrs.get("classification", "neutral")
        if classification == "c":
            color = "blue"
        elif classification == "f":
            color = "red"
        else:
            color = "green"  # Default color for unknown classification
        node_colors.append(color)

        # Add node positions to the trace
        node_x.append(pos[node][0])
        node_y.append(pos[node][1])

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        text=node_text,
        mode="markers",
        hoverinfo="text",
        marker=dict(
            showscale=True, colorscale="YlGnBu", size=10, color=node_colors
        ),
    )

    # Define layout settings. The title font is set through title.font
    # because the top-level 'titlefont' attribute is deprecated in Plotly.
    layout = go.Layout(
        title=dict(
            text="Network Graph Indicating Campaign Contributions from 2018-2022",
            font=dict(size=16),
        ),
        showlegend=True,
        hovermode="closest",
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=True, zeroline=True, showticklabels=False),
        yaxis=dict(showgrid=True, zeroline=True, showticklabels=False),
    )

    fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
    fig.show()
205+
206+
207+
def construct_network_graph(
    start_year: int, end_year: int, dfs: list[pd.DataFrame]
) -> None:
    """Runs the network construction pipeline starting from 3 dataframes

    The transactions are restricted to the requested years, combined with
    the entity tables, turned into a graph, and plotted. The graph is also
    saved as an adjacency list.

    Args:
        start_year & end_year: the range of the desired data; both bounds
            are inclusive
        dfs: dataframes in the order: inds_df, orgs_df, transactions_df.
            transactions_df needs a 'year' column.

    Returns:
        None. Displays the plotly figure and writes the graph's adjacency
        list to the file "Network Graph Node Data" in the current working
        directory.
    """
    inds_df, orgs_df, transactions_df = dfs

    # Take an explicit copy of the filtered rows: the combine step adds a
    # column to this table, and assigning into a slice of the caller's
    # dataframe raises SettingWithCopyWarning.
    in_range = transactions_df["year"].between(start_year, end_year)
    transactions_df = transactions_df.loc[in_range].copy()

    aggreg_df = combine_datasets_for_network_graph(
        [inds_df, orgs_df, transactions_df]
    )
    G = create_network_graph(aggreg_df)
    plot_network_graph(G)
    nx.write_adjlist(G, "Network Graph Node Data")

0 commit comments

Comments
 (0)