|
| 1 | +import networkx as nx |
| 2 | +import pandas as pd |
| 3 | +import plotly.graph_objects as go |
| 4 | + |
| 5 | + |
def name_identifier(uuid: str, dfs: list[pd.DataFrame]) -> "str | None":
    """Return the name of the entity with the given uuid.

    The dataframes are searched in the order given. For each dataframe the
    'name' column is used if present, otherwise 'full_name'; dataframes with
    neither column are skipped. The first row whose 'id' equals ``uuid``
    supplies the returned value.

    Args:
        uuid: the uuid of the entity
        dfs: dataframes that have an 'id' column and a 'name' or 'full_name'
            column

    Returns:
        The entity's name, or None if the uuid is not found in any dataframe.
    """
    for df in dfs:
        # 'name' takes precedence over 'full_name' when a dataframe has both.
        name_col = next(
            (col for col in ("name", "full_name") if col in df.columns), None
        )
        if name_col is None:
            continue

        # Filter once per dataframe; the first matching row wins.
        matches = df.loc[df["id"] == uuid]
        if not matches.empty:
            return matches.iloc[0][name_col]
    return None
| 27 | + |
| 28 | + |
def combine_datasets_for_network_graph(dfs: list[pd.DataFrame]) -> pd.DataFrame:
    """Combine the 3 dataframes into a single dataframe to create a graph.

    Adds a 'recipient_name' column to (a copy of) the transactions df, merges
    the individuals and organizations dfs with the transactions on the donor
    id, concatenates the two merged dfs, and aggregates the result per
    (donor, recipient) pair.

    Args:
        dfs: list of dataframes in the order
            [inds_df, orgs_df, transactions_df], where
            the individuals dataframe has the columns 'id' and 'full_name',
            the organizations dataframe has the columns 'id' and 'name',
            the transactions dataframe has the columns 'donor_id',
            'recipient_id' and 'amount'.

    Returns:
        A merged dataframe with aggregate contribution amounts between
        entities: 'amount' is summed per (donor, recipient) pair and every
        other attribute column keeps its first value. The input dataframes
        are not modified.
    """
    inds_df, orgs_df, transactions_df = dfs
    group_cols = ["donor_id", "recipient_id", "full_name", "recipient_name"]

    # First give every transaction a recipient name tied to its recipient id.
    # `assign` returns a new dataframe, so the caller's transactions_df (which
    # may be a slice of a larger frame) is never written to.
    transactions_df = transactions_df.assign(
        recipient_name=transactions_df["recipient_id"].apply(
            name_identifier, args=([orgs_df, inds_df],)
        )
    )

    # Next, merge the inds_df and orgs_df ids with the transactions_df
    # donor_id. Entities without any transaction come out of the left merge
    # with a missing 'amount' and are dropped.
    inds_trans_df = pd.merge(
        inds_df, transactions_df, how="left", left_on="id", right_on="donor_id"
    ).dropna(subset=["amount"])
    orgs_trans_df = pd.merge(
        orgs_df, transactions_df, how="left", left_on="id", right_on="donor_id"
    ).dropna(subset=["amount"])
    # Organizations carry their name in 'name'; align it with individuals.
    orgs_trans_df = orgs_trans_df.rename(columns={"name": "full_name"})

    # Concatenate the merged dfs.
    merged_df = pd.concat([orgs_trans_df, inds_trans_df])

    # Lastly, create the final dataframe with aggregated attributes.
    # NOTE: groupby excludes rows with a missing group key by default, so
    # transactions whose recipient name could not be resolved are left out.
    attribute_cols = merged_df.columns.difference(group_cols)
    agg_functions = {
        col: "sum" if col == "amount" else "first" for col in attribute_cols
    }
    aggreg_df = merged_df.groupby(group_cols).agg(agg_functions).reset_index()

    # 'id' duplicates 'donor_id' after the merge.
    return aggreg_df.drop(columns=["id"])
| 84 | + |
| 85 | + |
def create_network_graph(df: pd.DataFrame) -> nx.MultiDiGraph:
    """Build a MultiDiGraph of contributions from the merged dataframe.

    Each row produces a donor node (keyed by 'full_name'), a recipient node
    (keyed by 'recipient_name') and one directed edge from donor to
    recipient. Columns listed in ``edge_columns`` become edge attributes; all
    remaining columns become attributes of the donor node. Missing (NaN)
    values are not stored as attributes.

    Args:
        df: a pandas dataframe with merged information from the inds, orgs, &
            transactions dataframes; must contain the columns 'full_name'
            and 'recipient_name'

    Returns:
        A Networkx MultiDiGraph with nodes and edges
    """
    G = nx.MultiDiGraph()
    edge_columns = [
        "office_sought",
        "purpose",
        "transaction_type",
        "year",
        "transaction_id",
        "donor_office",
        "amount",
    ]
    # Only look up edge columns the dataframe actually has, so an input
    # lacking an optional column does not raise a KeyError.
    present_edge_columns = [col for col in edge_columns if col in df.columns]
    # Loop-invariant: every non-edge column describes the donor entity.
    node_columns = df.columns.difference(edge_columns)

    for _, row in df.iterrows():
        donor = row["full_name"]
        recipient = row["recipient_name"]

        # add node attributes based on the columns relevant to the entity
        G.add_node(donor, **row[node_columns].dropna().to_dict())

        # Add the recipient as a node. add_node on an existing node updates
        # its attributes, so the "neutral" default is applied only when the
        # node has no classification yet; otherwise an entity that is also a
        # donor would have its real classification overwritten.
        G.add_node(recipient)
        G.nodes[recipient].setdefault("classification", "neutral")

        # add the edge attributes between two nodes
        edge_attributes = row[present_edge_columns].dropna().to_dict()
        G.add_edge(donor, recipient, **edge_attributes)

    return G
| 122 | + |
| 123 | + |
def plot_network_graph(G: nx.MultiDiGraph):
    """Create and show a plotly visualization of the graph's nodes and edges.

    Nodes are positioned with a spring layout. Edges are drawn as line
    segments with arrow markers and show their 'amount' on hover. Nodes are
    colored by their 'classification' attribute ("c" blue, "f" red, anything
    else green) and show all of their attributes on hover.

    Args:
        G: A networkX MultiDiGraph whose edges may carry the attribute
            'amount'

    Returns: None. Creates a plotly graph
    """
    pos = nx.spring_layout(G)

    # Collect coordinates in plain lists and hand them to plotly once,
    # instead of repeatedly re-assigning growing tuples on the trace.
    edge_x, edge_y, edge_hovertext = [], [], []
    for source, target, attributes in G.edges(data=True):
        amount = attributes.get("amount")
        if amount is None:
            # The edge was created from a row without an amount.
            label = "Amount: unknown"
        else:
            label = f"Amount: {amount:.2f}"

        # None creates a gap between line segments
        edge_x.extend((pos[source][0], pos[target][0], None))
        edge_y.extend((pos[source][1], pos[target][1], None))
        # Each edge contributes three points (source, target, gap), so the
        # label is repeated to keep hovertext aligned with the points.
        edge_hovertext.extend((label, label, label))

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        hovertext=edge_hovertext,
        line=dict(color="#888", width=1.5),
        hoverinfo="text",
        mode="lines+markers",
        # Arrow symbol for edges, oriented along the segment direction
        marker=dict(symbol="arrow", color="#888", size=10, angleref="previous"),
    )

    classification_colors = {"c": "blue", "f": "red"}
    node_x, node_y, node_text, node_colors = [], [], [], []
    for node, attributes in G.nodes(data=True):
        details = "".join(
            f"{key}: {value}<br>" for key, value in attributes.items()
        )
        node_text.append(f"Name: {node}<br>{details}")

        # Assign a color based on the classification value; green is the
        # default color for unknown classification.
        classification = attributes.get("classification", "neutral")
        node_colors.append(classification_colors.get(classification, "green"))

        # Add node positions to the trace
        node_x.append(pos[node][0])
        node_y.append(pos[node][1])

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        text=node_text,
        mode="markers",
        hoverinfo="text",
        marker=dict(
            showscale=True, colorscale="YlGnBu", size=10, color=node_colors
        ),
    )

    # Define layout settings. The title font is set through title.font; the
    # older top-level `titlefont` attribute is deprecated.
    layout = go.Layout(
        title=dict(
            text="Network Graph Indicating Campaign Contributions from 2018-2022",
            font=dict(size=16),
        ),
        showlegend=True,
        hovermode="closest",
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(showgrid=True, zeroline=True, showticklabels=False),
        yaxis=dict(showgrid=True, zeroline=True, showticklabels=False),
    )

    fig = go.Figure(data=[edge_trace, node_trace], layout=layout)
    fig.show()
| 205 | + |
| 206 | + |
def construct_network_graph(
    start_year: int,
    end_year: int,
    dfs: list[pd.DataFrame],
    output_path: str = "Network Graph Node Data",
):
    """Run the network construction pipeline starting from 3 dataframes.

    Restricts the transactions to the requested years, combines the
    dataframes, builds the graph, shows the plot, and writes the graph to
    disk as an adjacency list.

    Args:
        start_year: first year (inclusive) of the desired data
        end_year: last year (inclusive) of the desired data
        dfs: dataframes in the order: inds_df, orgs_df, transactions_df; the
            transactions dataframe must have a 'year' column
        output_path: file the adjacency list is written to; defaults to the
            previously hard-coded name in the current working directory

    Returns:
        None. Displays the plotly figure and writes the adjacency list file.
    """
    inds_df, orgs_df, transactions_df = dfs

    # Take an explicit copy of the filtered rows: downstream code adds a
    # column to the transactions, and doing that on a slice of the caller's
    # dataframe triggers pandas' chained-assignment warning.
    transactions_df = transactions_df.loc[
        (transactions_df.year >= start_year)
        & (transactions_df.year <= end_year)
    ].copy()

    aggreg_df = combine_datasets_for_network_graph(
        [inds_df, orgs_df, transactions_df]
    )
    G = create_network_graph(aggreg_df)
    plot_network_graph(G)
    # NOTE(review): the adjacency-list format records which nodes are
    # connected but not node or edge attributes - confirm that is the
    # intended output given the file name.
    nx.write_adjlist(G, output_path)
0 commit comments