From e35799dc6d289a538fafdd1c5ba34ff4c5ad434a Mon Sep 17 00:00:00 2001 From: mikhail Date: Thu, 1 Aug 2024 22:25:27 +0300 Subject: [PATCH 1/9] add Nettack attack --- experiments/attack_defense_test.py | 141 ++++- metainfo/poison_attack_parameters.json | 7 + src/attacks/nettack/nettack.py | 701 +++++++++++++++++++++++++ src/attacks/nettack/utils.py | 112 ++++ src/attacks/poison_attacks.py | 99 ++++ src/models_builder/models_zoo.py | 4 +- 6 files changed, 1058 insertions(+), 6 deletions(-) create mode 100644 src/attacks/nettack/nettack.py create mode 100644 src/attacks/nettack/utils.py diff --git a/experiments/attack_defense_test.py b/experiments/attack_defense_test.py index c82ccbd..a8dfa60 100644 --- a/experiments/attack_defense_test.py +++ b/experiments/attack_defense_test.py @@ -144,10 +144,10 @@ def test_attack_defense(): } ) - # gnn_model_manager.set_poison_attacker(poison_attack_config=poison_attack_config) + gnn_model_manager.set_poison_attacker(poison_attack_config=poison_attack_config) # gnn_model_manager.set_poison_defender(poison_defense_config=poison_defense_config) - gnn_model_manager.set_evasion_attacker(evasion_attack_config=evasion_attack_config) - gnn_model_manager.set_evasion_defender(evasion_defense_config=evasion_defense_config) + # gnn_model_manager.set_evasion_attacker(evasion_attack_config=evasion_attack_config) + # gnn_model_manager.set_evasion_defender(evasion_defense_config=evasion_defense_config) warnings.warn("Start training") dataset.train_test_split() @@ -175,7 +175,140 @@ def test_attack_defense(): print(metric_loc) +def test_nettack_attack(): + my_device = device('cpu') + + # Choose the node index + node_idx = 1900 + + # Load dataset + full_name = ("single-graph", "Planetoid", 'Cora') + dataset, data, results_dataset_path = DatasetManager.get_by_full_name( + full_name=full_name, + dataset_ver_ind=0 + ) + + n = dataset.data.x.size(0) + train_mask = torch.full((2708,), True, dtype=torch.bool) + num_false_elements = int(n * 0.3) + indices = torch.randperm(n)[1:num_false_elements] + train_mask.index_fill_(0, indices, False) + train_mask[node_idx] = False + + dataset.train_mask = train_mask + dataset.test_mask =~ train_mask + + dataset.val_mask = torch.full((2708,), False, dtype=torch.bool) + # dataset.test_mask = torch.full((2708,), False, dtype=torch.bool) + + # Train model on original dataset and remember the model metric and node predicted probability + gcn_gcn = model_configs_zoo(dataset=dataset, model_name='gcn_gcn') + + manager_config = ConfigPattern( + _config_class="ModelManagerConfig", + _config_kwargs={ + "mask_features": [], + "optimizer": { + "_class_name": "Adam", + "_config_kwargs": {}, + } + } + ) + + steps_epochs = 2000 + gcn_gcn_model_manager = FrameworkGNNModelManager( + gnn=gcn_gcn, + dataset_path=results_dataset_path, + manager_config=manager_config, + modification=ModelModificationConfig(model_ver_ind=0, epochs=0) + ) + + gcn_gcn_model_manager.gnn.to(my_device) + + train_test_split_path = gcn_gcn_model_manager.train_model(gen_dataset=dataset, + steps=steps_epochs, + save_model_flag=False, + metrics=[Metric("F1", mask='train', average=None)]) + + # save train_test_mask to test the model on poisoned data with the same split + dataset.save_train_test_mask(train_test_split_path) # TODO сделать сохранение разбиения test/train + + metric_original_dataset = gcn_gcn_model_manager.evaluate_model( + gen_dataset=dataset, + metrics=[Metric("Accuracy", mask='test')])['test'] + + gcn_gcn_model_manager.gnn.eval() + with torch.no_grad(): + probabilities = 
torch.exp(gcn_gcn_model_manager.gnn(dataset.data.x, dataset.data.edge_index)) + + predicted_class = probabilities[node_idx].argmax().item() + predicted_probability = probabilities[node_idx][predicted_class].item() + real_class = dataset.data.y[node_idx].item() + + original_dataset_predictions_info = {"metric_original_dataset": metric_original_dataset, + "node_idx": node_idx, + "predicted_class": predicted_class, + "predicted_probability": predicted_probability, + "real_class": real_class} + + # Attack + nettack_poison_attack_config = ConfigPattern( + _class_name="NettackPoisonAttack", + _import_path=POISON_ATTACK_PARAMETERS_PATH, + _config_class="PoisonAttackConfig", + _config_kwargs={ + "node_idx": node_idx, + "direct_attack": True, + "n_influencers": 5, + "perturb_features": True, + "perturb_structure": True + } + ) + new_gcn_gcn = model_configs_zoo(dataset=dataset, model_name='gcn_gcn') + new_gcn_gcn_model_manager = FrameworkGNNModelManager( + gnn=new_gcn_gcn, + dataset_path=results_dataset_path, + manager_config=manager_config, + modification=ModelModificationConfig(model_ver_ind=0, epochs=0) + ) + + new_gcn_gcn_model_manager.set_poison_attacker(poison_attack_config=nettack_poison_attack_config) + + # TODO сделать сохранение разбиения test/train + # train_mask, val_mask, test_mask, train_test_sizes = torch.load(train_test_split_path / 'train_test_split')[:] + # dataset.train_mask, dataset.val_mask, dataset.test_mask = train_mask, val_mask, test_mask + # data.percent_train_class, data.percent_test_class = train_test_sizes + + new_gcn_gcn_model_manager.train_model(gen_dataset=dataset, + steps=steps_epochs, + save_model_flag=False, + metrics=[Metric("F1", mask='train', average=None)]) + + metric_poison_dataset = new_gcn_gcn_model_manager.evaluate_model( + gen_dataset=new_gcn_gcn_model_manager.poison_attacker.attack_diff, + metrics=[Metric("Accuracy", mask='test')])['test'] + + new_gcn_gcn_model_manager.gnn.eval() + with torch.no_grad(): + probabilities = torch.exp(new_gcn_gcn_model_manager.gnn(new_gcn_gcn_model_manager.poison_attacker.attack_diff.data.x, + new_gcn_gcn_model_manager.poison_attacker.attack_diff.data.edge_index)) + + predicted_class = probabilities[node_idx].argmax().item() + predicted_probability = probabilities[node_idx][predicted_class].item() + real_class = dataset.data.y[node_idx].item() + + poisoned_dataset_predictions_info = {"metric_poison_dataset": metric_poison_dataset, + "node_idx": node_idx, + "predicted_class": predicted_class, + "predicted_probability": predicted_probability, + "real_class": real_class} + + print(original_dataset_predictions_info) + print(poisoned_dataset_predictions_info) + + if __name__ == '__main__': - test_attack_defense() + # test_attack_defense() + test_nettack_attack() diff --git a/metainfo/poison_attack_parameters.json b/metainfo/poison_attack_parameters.json index 980cabd..ab032c7 100644 --- a/metainfo/poison_attack_parameters.json +++ b/metainfo/poison_attack_parameters.json @@ -3,6 +3,13 @@ }, "RandomPoisonAttack": { "n_edges_percent": ["n_edges_percent", "float", 0.1, {"min": 0.0001, "step": 0.01}, "?"] + }, + "NettackPoisonAttack": { + "node_idx": ["node_idx", "int", 0, {"min": 0, "step": 1}, "Index of teh node to attack"], + "direct_attack": ["direct_attack", "bool", true, {}, "?"], + "n_influencers": ["n_influencers", "int", 5, {"min": 1, "step": 1}, "?"], + "perturb_features": ["perturb_features", "bool", true, {}, "?"], + "perturb_structure": ["perturb_structure", "bool", true, {}, "?"] } } diff --git 
a/src/attacks/nettack/nettack.py b/src/attacks/nettack/nettack.py new file mode 100644 index 0000000..11784e7 --- /dev/null +++ b/src/attacks/nettack/nettack.py @@ -0,0 +1,701 @@ +import numpy as np +import scipy.sparse as sp +from src.attacks.nettack.utils import preprocess_graph +import torch + + +class Nettack: + """ + Nettack class used for poisoning attacks on node classification models. + Copyright (C) 2018 + Daniel Zügner + Technical University of Munich + """ + + def __init__(self, adj, X_obs, z_obs, W1, W2, u, verbose=False): + + # Adjacency matrix + self.adj = adj.copy().tolil() + self.adj_no_selfloops = self.adj.copy() + self.adj_no_selfloops.setdiag(0) + self.adj_orig = self.adj.copy().tolil() + self.u = u # the node being attacked + self.adj_preprocessed = preprocess_graph(self.adj).tolil() + # Number of nodes + self.N = adj.shape[0] + + # Node attributes + self.X_obs = X_obs.copy().tolil() + self.X_obs_orig = self.X_obs.copy().tolil() + # Node labels + self.z_obs = z_obs.copy() + self.label_u = self.z_obs[self.u] + self.K = np.max(self.z_obs)+1 + # GCN weight matrices + self.W1 = W1 + self.W2 = W2 + # self.W = sp.csr_matrix(self.W1.dot(self.W2)) + self.W = sp.csr_matrix(torch.matmul(self.W1, self.W2).detach().numpy()) + + self.cooc_matrix = self.X_obs.T.dot(self.X_obs).tolil() + self.cooc_constraint = None + + self.structure_perturbations = [] + self.feature_perturbations = [] + + self.influencer_nodes = [] + self.potential_edges = [] + self.verbose = verbose + + def compute_cooccurrence_constraint(self, nodes): + """ + Co-occurrence constraint as described in the paper. + + Parameters + ---------- + nodes: np.array + Nodes whose features are considered for change + + Returns + ------- + np.array [len(nodes), D], dtype bool + Binary matrix of dimension len(nodes) x D. A 1 in entry n,d indicates that + we are allowed to add feature d to the features of node n. + + """ + + words_graph = self.cooc_matrix.copy() + D = self.X_obs.shape[1] + words_graph.setdiag(0) + words_graph = (words_graph > 0) + word_degrees = np.sum(words_graph, axis=0).A1 + + inv_word_degrees = np.reciprocal(word_degrees.astype(float) + 1e-8) + + sd = np.zeros([self.N]) + for n in range(self.N): + n_idx = self.X_obs[n, :].nonzero()[1] + sd[n] = np.sum(inv_word_degrees[n_idx.tolist()]) + + scores_matrix = sp.lil_matrix((self.N, D)) + + for n in nodes: + common_words = words_graph.multiply(self.X_obs[n]) + idegs = inv_word_degrees[common_words.nonzero()[1]] + nnz = common_words.nonzero()[0] + scores = np.array([idegs[nnz == ix].sum() for ix in range(D)]) + scores_matrix[n] = scores + self.cooc_constraint = sp.csr_matrix(scores_matrix - 0.5 * sd[:, None] > 0) + + + def gradient_wrt_x(self, label): + """ + Compute the gradient of the logit belonging to the class of the input label with respect to the input features. + + Parameters + ---------- + label: int + Class whose logits are of interest + + Returns + ------- + np.array [N, D] matrix containing the gradients. + + """ + + return self.adj_preprocessed.dot(self.adj_preprocessed)[self.u].T.dot(self.W[:, label].T) + + def compute_logits(self): + """ + Compute the logits of the surrogate model, i.e. linearized GCN. + + Returns + ------- + np.array, [N, K] + The log probabilities for each node. + + """ + return self.adj_preprocessed.dot(self.adj_preprocessed).dot(self.X_obs.dot(self.W))[self.u].toarray()[0] + + def strongest_wrong_class(self, logits): + """ + Determine the incorrect class with largest logits. 
+ + Parameters + ---------- + logits: np.array, [N, K] + The input logits + + Returns + ------- + np.array, [N, L] + The indices of the wrong labels with the highest attached log probabilities. + """ + + label_u_onehot = np.eye(self.K)[self.label_u] + return (logits - 1000*label_u_onehot).argmax() + + def feature_scores(self): + """ + Compute feature scores for all possible feature changes. + """ + + if self.cooc_constraint is None: + self.compute_cooccurrence_constraint(self.influencer_nodes) + logits = self.compute_logits() + best_wrong_class = self.strongest_wrong_class(logits) + gradient = self.gradient_wrt_x(self.label_u) - self.gradient_wrt_x(best_wrong_class) + surrogate_loss = logits[self.label_u] - logits[best_wrong_class] + + gradients_flipped = (gradient * -1).tolil() + gradients_flipped[self.X_obs.nonzero()] *= -1 + + X_influencers = sp.lil_matrix(self.X_obs.shape) + X_influencers[self.influencer_nodes] = self.X_obs[self.influencer_nodes] + gradients_flipped = gradients_flipped.multiply((self.cooc_constraint + X_influencers) > 0) + nnz_ixs = np.array(gradients_flipped.nonzero()).T + + sorting = np.argsort(gradients_flipped[tuple(nnz_ixs.T)]).A1 + sorted_ixs = nnz_ixs[sorting] + grads = gradients_flipped[tuple(nnz_ixs[sorting].T)] + + scores = surrogate_loss - grads + return sorted_ixs[::-1], scores.A1[::-1] + + def struct_score(self, a_hat_uv, XW): + """ + Compute structure scores, cf. Eq. 15 in the paper + + Parameters + ---------- + a_hat_uv: sp.sparse_matrix, shape [P,2] + Entries of matrix A_hat^2_u for each potential edge (see paper for explanation) + + XW: sp.sparse_matrix, shape [N, K], dtype float + The class logits for each node. + + Returns + ------- + np.array [P,] + The struct score for every row in a_hat_uv + """ + + logits = a_hat_uv.dot(XW) + label_onehot = np.eye(XW.shape[1])[self.label_u] + best_wrong_class_logits = (logits - 1000 * label_onehot).max(1) + logits_for_correct_class = logits[:,self.label_u] + struct_scores = logits_for_correct_class - best_wrong_class_logits + + return struct_scores + + def compute_XW(self): + """ + Shortcut to compute the dot product of X and W + Returns + ------- + X.dot(W) + """ + + return self.X_obs.dot(self.W) + + def get_attacker_nodes(self, n=5, add_additional_nodes = False): + """ + Determine the influencer nodes to attack node i based on the weights W and the attributes X. + + Parameters + ---------- + n: int, default: 5 + The desired number of attacker nodes. + + add_additional_nodes: bool, default: False + if True and the degree of node i (d_u) is < n, we select n-d_u additional attackers, which should + get connected to u afterwards (outside this function). + + Returns + ------- + np.array, shape [n,]: + The indices of the attacker nodes. + optional: np.array, shape [n - degree(n)] + if additional_nodes is True, we separately + return the additional attacker node indices + + """ + + assert n < self.N-1, "number of influencers cannot be >= number of nodes in the graph!" 
+ + neighbors = self.adj_no_selfloops[self.u].nonzero()[1] + assert self.u not in neighbors + + potential_edges = np.column_stack((np.tile(self.u, len(neighbors)),neighbors)).astype("int32") + + # The new A_hat_square_uv values that we would get if we removed the edge from u to each of the neighbors, + # respectively + a_hat_uv = self.compute_new_a_hat_uv(potential_edges) + + XW = self.compute_XW() + + # compute the struct scores for all neighbors + struct_scores = self.struct_score(a_hat_uv, XW).A1 + + if len(neighbors) >= n: # do we have enough neighbors for the number of desired influencers? + influencer_nodes = neighbors[np.argsort(struct_scores)[:n]] + if add_additional_nodes: + return influencer_nodes, np.array([]) + return influencer_nodes + else: + influencer_nodes = neighbors + if add_additional_nodes: # Add additional influencers by connecting them to u first. + # Compute the set of possible additional influencers, i.e. all nodes except the ones + # that are already connected to u. + poss_add_infl = np.setdiff1d(np.setdiff1d(np.arange(self.N),neighbors), self.u) + n_possible_additional = len(poss_add_infl) + n_additional_attackers = n-len(neighbors) + possible_edges = np.column_stack((np.tile(self.u, n_possible_additional), poss_add_infl)) + + # Compute the struct_scores for all possible additional influencers, and choose the one + # with the best struct score. + a_hat_uv_additional = self.compute_new_a_hat_uv(possible_edges) + additional_struct_scores = self.struct_score(a_hat_uv_additional, XW) + additional_influencers = poss_add_infl[np.argsort(additional_struct_scores)[-n_additional_attackers::]] + + return influencer_nodes, additional_influencers + else: + return influencer_nodes + + def compute_new_a_hat_uv(self, potential_edges): + """ + Compute the updated A_hat_square_uv entries that would result from inserting/deleting the input edges, + for every edge. + + Parameters + ---------- + potential_edges: np.array, shape [P,2], dtype int + The edges to check. + + Returns + ------- + sp.sparse_matrix: updated A_hat_square_u entries, a sparse PxN matrix, where P is len(possible_edges). + """ + + edges = np.array(self.adj.nonzero()).T + edges_set = {tuple(x) for x in edges} + A_hat_sq = self.adj_preprocessed @ self.adj_preprocessed + values_before = A_hat_sq[self.u].toarray()[0] + node_ixs = np.unique(edges[:, 0], return_index=True)[1] + twohop_ixs = np.array(A_hat_sq.nonzero()).T + degrees = self.adj.sum(0).A1 + 1 + + ixs, vals = compute_new_a_hat_uv(edges, node_ixs, edges_set, twohop_ixs, values_before, degrees, + potential_edges, self.u) + ixs_arr = np.array(ixs) + a_hat_uv = sp.coo_matrix((vals, (ixs_arr[:, 0], ixs_arr[:, 1])), shape=[len(potential_edges), self.N]) + + return a_hat_uv + + def attack_surrogate(self, n_perturbations, perturb_structure=True, perturb_features=True, + direct=True, n_influencers=0, delta_cutoff=0.004): + """ + Perform an attack on the surrogate model. + + Parameters + ---------- + n_perturbations: int + The number of perturbations (structure or feature) to perform. + + perturb_structure: bool, default: True + Indicates whether the structure can be changed. + + perturb_features: bool, default: True + Indicates whether the features can be changed. + + direct: bool, default: True + indicates whether to directly modify edges/features of the node attacked or only those of influencers. 
+ + n_influencers: int, default: 0 + Number of influencing nodes -- will be ignored if direct is True + + delta_cutoff: float + The critical value for the likelihood ratio test of the power law distributions. + See the Chi square distribution with one degree of freedom. Default value 0.004 + corresponds to a p-value of roughly 0.95. + + Returns + ------- + None. + + """ + + assert not (direct==False and n_influencers==0), "indirect mode requires at least one influencer node" + assert n_perturbations > 0, "need at least one perturbation" + assert perturb_features or perturb_structure, "either perturb_features or perturb_structure must be true" + + logits_start = self.compute_logits() + best_wrong_class = self.strongest_wrong_class(logits_start) + surrogate_losses = [logits_start[self.label_u] - logits_start[best_wrong_class]] + + if self.verbose: + print("##### Starting attack #####") + if perturb_structure and perturb_features: + print("##### Attack node with ID {} using structure and feature perturbations #####".format(self.u)) + elif perturb_features: + print("##### Attack only using feature perturbations #####") + elif perturb_structure: + print("##### Attack only using structure perturbations #####") + if direct: + print("##### Attacking the node directly #####") + else: + print("##### Attacking the node indirectly via {} influencer nodes #####".format(n_influencers)) + print("##### Performing {} perturbations #####".format(n_perturbations)) + + if perturb_structure: + + # Setup starting values of the likelihood ratio test. + degree_sequence_start = self.adj_orig.sum(0).A1 + current_degree_sequence = self.adj.sum(0).A1 + d_min = 2 + S_d_start = np.sum(np.log(degree_sequence_start[degree_sequence_start >= d_min])) + current_S_d = np.sum(np.log(current_degree_sequence[current_degree_sequence >= d_min])) + n_start = np.sum(degree_sequence_start >= d_min) + current_n = np.sum(current_degree_sequence >= d_min) + alpha_start = compute_alpha(n_start, S_d_start, d_min) + log_likelihood_orig = compute_log_likelihood(n_start, alpha_start, S_d_start, d_min) + + if len(self.influencer_nodes) == 0: + if not direct: + # Choose influencer nodes + infls, add_infls = self.get_attacker_nodes(n_influencers, add_additional_nodes=True) + self.influencer_nodes= np.concatenate((infls, add_infls)).astype("int") + # Potential edges are all edges from any attacker to any other node, except the respective + # attacker itself or the node being attacked. + self.potential_edges = np.row_stack([np.column_stack((np.tile(infl, self.N - 2), + np.setdiff1d(np.arange(self.N), + np.array([self.u,infl])))) for infl in + self.influencer_nodes]) + if self.verbose: + print("Influencer nodes: {}".format(self.influencer_nodes)) + else: + # direct attack + influencers = [self.u] + self.potential_edges = np.column_stack((np.tile(self.u, self.N-1), np.setdiff1d(np.arange(self.N), self.u))) + self.influencer_nodes = np.array(influencers) + self.potential_edges = self.potential_edges.astype("int32") + for _ in range(n_perturbations): + if self.verbose: + print("##### ...{}/{} perturbations ... #####".format(_+1, n_perturbations)) + if perturb_structure: + + # Do not consider edges that, if removed, result in singleton edges in the graph. + singleton_filter = filter_singletons(self.potential_edges, self.adj) + filtered_edges = self.potential_edges[singleton_filter] + + # Update the values for the power law likelihood ratio test. 
+ deltas = 2 * (1 - self.adj[tuple(filtered_edges.T)].toarray()[0] )- 1 + d_edges_old = current_degree_sequence[filtered_edges] + d_edges_new = current_degree_sequence[filtered_edges] + deltas[:, None] + new_S_d, new_n = update_Sx(current_S_d, current_n, d_edges_old, d_edges_new, d_min) + new_alphas = compute_alpha(new_n, new_S_d, d_min) + new_ll = compute_log_likelihood(new_n, new_alphas, new_S_d, d_min) + alphas_combined = compute_alpha(new_n + n_start, new_S_d + S_d_start, d_min) + new_ll_combined = compute_log_likelihood(new_n + n_start, alphas_combined, new_S_d + S_d_start, d_min) + new_ratios = -2 * new_ll_combined + 2 * (new_ll + log_likelihood_orig) + + # Do not consider edges that, if added/removed, would lead to a violation of the + # likelihood ration Chi_square cutoff value. + powerlaw_filter = filter_chisquare(new_ratios, delta_cutoff) + filtered_edges_final = filtered_edges[powerlaw_filter] + + # Compute new entries in A_hat_square_uv + a_hat_uv_new = self.compute_new_a_hat_uv(filtered_edges_final) + # Compute the struct scores for each potential edge + struct_scores = self.struct_score(a_hat_uv_new, self.compute_XW()) + best_edge_ix = struct_scores.argmin() + best_edge_score = struct_scores.min() + best_edge = filtered_edges_final[best_edge_ix] + + if perturb_features: + # Compute the feature scores for each potential feature perturbation + feature_ixs, feature_scores = self.feature_scores() + best_feature_ix = feature_ixs[0] + best_feature_score = feature_scores[0] + + if perturb_structure and perturb_features: + # decide whether to choose an edge or feature to change + if best_edge_score < best_feature_score: + if self.verbose: + print("Edge perturbation: {}".format(best_edge)) + change_structure = True + else: + if self.verbose: + print("Feature perturbation: {}".format(best_feature_ix)) + change_structure=False + elif perturb_structure: + change_structure = True + elif perturb_features: + change_structure = False + + if change_structure: + # perform edge perturbation + + self.adj[tuple(best_edge)] = self.adj[tuple(best_edge[::-1])] = 1 - self.adj[tuple(best_edge)] + self.adj_preprocessed = preprocess_graph(self.adj) + + self.structure_perturbations.append(tuple(best_edge)) + self.feature_perturbations.append(()) + surrogate_losses.append(best_edge_score) + + # Update likelihood ratio test values + current_S_d = new_S_d[powerlaw_filter][best_edge_ix] + current_n = new_n[powerlaw_filter][best_edge_ix] + current_degree_sequence[best_edge] += deltas[powerlaw_filter][best_edge_ix] + + else: + self.X_obs[tuple(best_feature_ix)] = 1 - self.X_obs[tuple(best_feature_ix)] + + self.feature_perturbations.append(tuple(best_feature_ix)) + self.structure_perturbations.append(()) + surrogate_losses.append(best_feature_score) + + def reset(self): + """ + Reset Nettack + """ + self.adj = self.adj_orig.copy() + self.X_obs = self.X_obs_orig.copy() + self.structure_perturbations = [] + self.feature_perturbations = [] + self.influencer_nodes = [] + self.potential_edges = [] + self.cooc_constraint = None + + +# @jit(nopython=True) +def connected_after(u, v, connected_before, delta): + if u == v: + if delta == -1: + return False + else: + return True + else: + return connected_before + +# @jit(nopython=True) +def compute_new_a_hat_uv(edge_ixs, node_nb_ixs, edges_set, twohop_ixs, values_before, degs, potential_edges, u): + """ + Compute the new values [A_hat_square]_u for every potential edge, where u is the target node. C.f. Theorem 5.1 + equation 17. 
+ + Parameters + ---------- + edge_ixs: np.array, shape [E,2], where E is the number of edges in the graph. + The indices of the nodes connected by the edges in the input graph. + node_nb_ixs: np.array, shape [N,], dtype int + For each node, this gives the first index of edges associated to this node in the edge array (edge_ixs). + This will be used to quickly look up the neighbors of a node, since numba does not allow nested lists. + edges_set: set((e0, e1)) + The set of edges in the input graph, i.e. e0 and e1 are two nodes connected by an edge + twohop_ixs: np.array, shape [T, 2], where T is the number of edges in A_tilde^2 + The indices of nodes that are in the twohop neighborhood of each other, including self-loops. + values_before: np.array, shape [N,], the values in [A_hat]^2_uv to be updated. + degs: np.array, shape [N,], dtype int + The degree of the nodes in the input graph. + potential_edges: np.array, shape [P, 2], where P is the number of potential edges. + The potential edges to be evaluated. For each of these potential edges, this function will compute the values + in [A_hat]^2_uv that would result after inserting/removing this edge. + u: int + The target node + + Returns + ------- + return_ixs: List of tuples + The ixs in the [P, N] matrix of updated values that have changed + return_values: + + """ + N = degs.shape[0] + + twohop_u = twohop_ixs[twohop_ixs[:, 0] == u, 1] + nbs_u = edge_ixs[edge_ixs[:, 0] == u, 1] + nbs_u_set = set(nbs_u) + + return_ixs = [] + return_values = [] + + for ix in range(len(potential_edges)): + edge = potential_edges[ix] + edge_set = set(edge) + degs_new = degs.copy() + delta = -2 * ((edge[0], edge[1]) in edges_set) + 1 + degs_new[edge] += delta + + nbs_edge0 = edge_ixs[edge_ixs[:, 0] == edge[0], 1] + nbs_edge1 = edge_ixs[edge_ixs[:, 0] == edge[1], 1] + + affected_nodes = set(np.concatenate((twohop_u, nbs_edge0, nbs_edge1))) + affected_nodes = affected_nodes.union(edge_set) + a_um = edge[0] in nbs_u_set + a_un = edge[1] in nbs_u_set + + a_un_after = connected_after(u, edge[0], a_un, delta) + a_um_after = connected_after(u, edge[1], a_um, delta) + + for v in affected_nodes: + a_uv_before = v in nbs_u_set + a_uv_before_sl = a_uv_before or v == u + + if v in edge_set and u in edge_set and u != v: + if delta == -1: + a_uv_after = False + else: + a_uv_after = True + else: + a_uv_after = a_uv_before + a_uv_after_sl = a_uv_after or v == u + + from_ix = node_nb_ixs[v] + to_ix = node_nb_ixs[v + 1] if v < N - 1 else len(edge_ixs) + node_nbs = edge_ixs[from_ix:to_ix, 1] + node_nbs_set = set(node_nbs) + a_vm_before = edge[0] in node_nbs_set + + a_vn_before = edge[1] in node_nbs_set + a_vn_after = connected_after(v, edge[0], a_vn_before, delta) + a_vm_after = connected_after(v, edge[1], a_vm_before, delta) + + mult_term = 1 / np.sqrt(degs_new[u] * degs_new[v]) + + sum_term1 = np.sqrt(degs[u] * degs[v]) * values_before[v] - a_uv_before_sl / degs[u] - a_uv_before / \ + degs[v] + sum_term2 = a_uv_after / degs_new[v] + a_uv_after_sl / degs_new[u] + sum_term3 = -((a_um and a_vm_before) / degs[edge[0]]) + (a_um_after and a_vm_after) / degs_new[edge[0]] + sum_term4 = -((a_un and a_vn_before) / degs[edge[1]]) + (a_un_after and a_vn_after) / degs_new[edge[1]] + new_val = mult_term * (sum_term1 + sum_term2 + sum_term3 + sum_term4) + + return_ixs.append((ix, v)) + return_values.append(new_val) + + return return_ixs, return_values + +def compute_alpha(n, S_d, d_min): + """ + Approximate the alpha of a power law distribution. 
+ + Parameters + ---------- + n: int or np.array of int + Number of entries that are larger than or equal to d_min + + S_d: float or np.array of float + Sum of log degrees in the distribution that are larger than or equal to d_min + + d_min: int + The minimum degree of nodes to consider + + Returns + ------- + alpha: float + The estimated alpha of the power law distribution + """ + + return n / (S_d - n * np.log(d_min - 0.5)) + 1 + + +def update_Sx(S_old, n_old, d_old, d_new, d_min): + """ + Update on the sum of log degrees S_d and n based on degree distribution resulting from inserting or deleting + a single edge. + + Parameters + ---------- + S_old: float + Sum of log degrees in the distribution that are larger than or equal to d_min. + + n_old: int + Number of entries in the old distribution that are larger than or equal to d_min. + + d_old: np.array, shape [N,] dtype int + The old degree sequence. + + d_new: np.array, shape [N,] dtype int + The new degree sequence + + d_min: int + The minimum degree of nodes to consider + + Returns + ------- + new_S_d: float, the updated sum of log degrees in the distribution that are larger than or equal to d_min. + new_n: int, the updated number of entries in the old distribution that are larger than or equal to d_min. + """ + + old_in_range = d_old >= d_min + new_in_range = d_new >= d_min + + d_old_in_range = np.multiply(d_old, old_in_range) + d_new_in_range = np.multiply(d_new, new_in_range) + + new_S_d = S_old - np.log(np.maximum(d_old_in_range, 1)).sum(1) + np.log(np.maximum(d_new_in_range, 1)).sum(1) + new_n = n_old - np.sum(old_in_range, 1) + np.sum(new_in_range, 1) + + return new_S_d, new_n + + +def compute_log_likelihood(n, alpha, S_d, d_min): + """ + Compute log likelihood of the powerlaw fit. + + Parameters + ---------- + n: int + Number of entries in the old distribution that are larger than or equal to d_min. + + alpha: float + The estimated alpha of the power law distribution + + S_d: float + Sum of log degrees in the distribution that are larger than or equal to d_min. + + d_min: int + The minimum degree of nodes to consider + + Returns + ------- + float: the estimated log likelihood + """ + + return n * np.log(alpha) + n * alpha * np.log(d_min) - (alpha + 1) * S_d + + +def filter_singletons(edges, adj): + """ + Filter edges that, if removed, would turn one or more nodes into singleton nodes. + + Parameters + ---------- + edges: np.array, shape [P, 2], dtype int, where P is the number of input edges. + The potential edges. + + adj: sp.sparse_matrix, shape [N,N] + The input adjacency matrix. + + Returns + ------- + np.array, shape [P, 2], dtype bool: + A binary vector of length len(edges), False values indicate that the edge at + the index generates singleton edges, and should thus be avoided. 
+ + """ + + degs = np.squeeze(np.array(np.sum(adj,0))) + existing_edges = np.squeeze(np.array(adj.tocsr()[tuple(edges.T)])) + if existing_edges.size > 0: + edge_degrees = degs[np.array(edges)] + 2*(1-existing_edges[:,None]) - 1 + else: + edge_degrees = degs[np.array(edges)] + 1 + + zeros = edge_degrees == 0 + zeros_sum = zeros.sum(1) + return zeros_sum == 0 + + +def filter_chisquare(ll_ratios, cutoff): + return ll_ratios < cutoff \ No newline at end of file diff --git a/src/attacks/nettack/utils.py b/src/attacks/nettack/utils.py new file mode 100644 index 0000000..d436908 --- /dev/null +++ b/src/attacks/nettack/utils.py @@ -0,0 +1,112 @@ +import torch +import torch.nn.functional as F +from torch_geometric.nn import GCNConv + +import numpy as np +import scipy.sparse as sp +from scipy.sparse.csgraph import connected_components +from tqdm import tqdm + + +class GNNLinear(torch.nn.Module): + def __init__(self, num_features, hidden, num_classes): + super(GNNLinear, self).__init__() + + # Initialize the layers + self.conv1 = GCNConv(num_features, hidden, add_self_loops=False, bias=False) + self.conv2 = GCNConv(hidden, num_classes, add_self_loops=False, bias=False) + + def forward(self, x=None, edge_index=None, **kwargs): + x = self.conv1(x, edge_index) + x = self.conv2(x, edge_index) + x = F.log_softmax(x, dim=1) + return x + + +def preprocess_graph(adj): + adj_ = adj + sp.eye(adj.shape[0]) + rowsum = adj_.sum(1).A1 + degree_mat_inv_sqrt = sp.diags(np.power(rowsum, -0.5)) + adj_normalized = adj_.dot(degree_mat_inv_sqrt).T.dot(degree_mat_inv_sqrt).tocsr() + return adj_normalized + + +def largest_connected_components(adj, n_components=1): + """Select the largest connected components in the graph. + + Parameters + ---------- + sparse_graph : gust.SparseGraph + Input graph. + n_components : int, default 1 + Number of largest connected components to keep. + + Returns + ------- + sparse_graph : gust.SparseGraph + Subgraph of the input graph where only the nodes in largest n_components are kept. 
+ + """ + _, component_indices = connected_components(adj) + component_sizes = np.bincount(component_indices) + components_to_keep = np.argsort(component_sizes)[::-1][:n_components] # reverse order to sort descending + nodes_to_keep = [ + idx for (idx, component) in enumerate(component_indices) if component in components_to_keep + + + ] + print("Selecting {0} largest connected components".format(n_components)) + return nodes_to_keep + + +def data_to_csr_matrix(data): + + # Create sparse matrix CSR for edges + adj_tensor = data.edge_index + + num_edges = adj_tensor.size(1) + num_vertices = data.x.size(0) + + # Dividing a tensor into rows and columns + rows = adj_tensor[0].numpy() + cols = adj_tensor[1].numpy() + + # Edge weights (default 1) + data_edges = [1] * num_edges + + # Creating a Sparse CSR Matrix + adj_matrix = sp.csr_matrix((data_edges, (rows, cols)), shape=(num_vertices, num_vertices)) + + # Create sparse matrix CSR for nodes + attr_matrix = sp.csr_matrix(data.x.numpy()) + labels = data.y.numpy() + + return adj_matrix, attr_matrix, labels + + +def learn_w1_w2(dataset): + data = dataset.dataset.data + # TODO передавать параметр hidden + model_gnn_lin = GNNLinear(dataset.num_node_features, 16, dataset.num_classes) + + optimizer = torch.optim.Adam(model_gnn_lin.parameters(), + lr=0.001, + weight_decay=5e-4) + + num_epochs = 2000 + print("Train surrogate model") + for epoch in tqdm(range(num_epochs)): + model_gnn_lin.train() + optimizer.zero_grad() + preds = model_gnn_lin(data.x, data.edge_index) + loss = F.cross_entropy(preds[data.train_mask], data.y[data.train_mask]) + loss.backward() + optimizer.step() + print("End training") + + W1 = model_gnn_lin.conv1.lin.weight.T + W2 = model_gnn_lin.conv2.lin.weight.T + return W1, W2 + + + diff --git a/src/attacks/poison_attacks.py b/src/attacks/poison_attacks.py index fc8b972..15cc5ac 100644 --- a/src/attacks/poison_attacks.py +++ b/src/attacks/poison_attacks.py @@ -1,13 +1,112 @@ import numpy as np +import torch from attacks.attack_base import Attacker +# Nettack imports +from src.attacks.nettack.nettack import Nettack +from src.attacks.nettack.utils import preprocess_graph, largest_connected_components, data_to_csr_matrix, learn_w1_w2 +from torch_geometric.data import Data +# Nettack imports end + class PoisonAttacker(Attacker): def __init__(self, **kwargs): super().__init__() +class NettackPoisonAttack(PoisonAttacker): + name = "NettackPoisonAttack" + + def __init__(self, + node_idx=0, + direct_attack=True, + n_influencers=5, + perturb_features=True, + perturb_structure=True): + + super().__init__() + self.attack_diff = None + self.node_idx = node_idx + self.direct_attack = direct_attack + self.n_influencers = n_influencers + self.perturb_features = perturb_features + self.perturb_structure = perturb_structure + + def attack(self, gen_dataset): + # Prepare + data = gen_dataset.data + _A_obs, _X_obs, _z_obs = data_to_csr_matrix(data) + _A_obs = _A_obs + _A_obs.T + _A_obs[_A_obs > 1] = 1 + lcc = largest_connected_components(_A_obs) + + _A_obs = _A_obs[lcc][:, lcc] + + assert np.abs(_A_obs - _A_obs.T).sum() == 0, "Input graph is not symmetric" + assert _A_obs.max() == 1 and len(np.unique(_A_obs[_A_obs.nonzero()].A1)) == 1, "Graph must be unweighted" + assert _A_obs.sum(0).A1.min() > 0, "Graph contains singleton nodes" + + _X_obs = _X_obs[lcc].astype('float32') + _z_obs = _z_obs[lcc] + _N = _A_obs.shape[0] + _K = _z_obs.max() + 1 + _Z_obs = np.eye(_K)[_z_obs] + _An = preprocess_graph(_A_obs) + sizes = [16, _K] + degrees = _A_obs.sum(0).A1 + 
n_perturbations = int(degrees[self.node_idx]) + # n_perturbations = 3 + # End prepare + + # Learn matrix W1 and W2 + W1, W2 = learn_w1_w2(gen_dataset) + + # Attack + nettack = Nettack(_A_obs, _X_obs, _z_obs, W1, W2, self.node_idx, verbose=True) + + nettack.reset() + nettack.attack_surrogate(n_perturbations, + perturb_structure=self.perturb_structure, + perturb_features=self.perturb_features, + direct=self.direct_attack, + n_influencers=self.n_influencers) + + print(f'edges: {nettack.structure_perturbations}') + print(f'features: {nettack.feature_perturbations}') + + self._poisoning(gen_dataset, nettack.feature_perturbations, nettack.structure_perturbations) + self.attack_diff = gen_dataset + + return gen_dataset + + def attack_diff(self): + return self.attack_diff + + @staticmethod + def _poisoning(gen_dataset, feature_perturbations, structure_perturbations): + cleaned_feat_pert = list(filter(None, feature_perturbations)) + if cleaned_feat_pert: # list is not empty + x = gen_dataset.data.x.clone() + for vertex, feature in cleaned_feat_pert: + if x[vertex, feature] == 0.0: + x[vertex, feature] = 1.0 + elif x[vertex, feature] == 1.0: + x[vertex, feature] = 0.0 + gen_dataset.data.x = x + + cleaned_struct_pert = list(filter(None, structure_perturbations)) + if cleaned_struct_pert: # list is not empty + edge_index = gen_dataset.data.edge_index.clone() + # add edges + for edge in cleaned_struct_pert: + edge_index = torch.cat((edge_index, + torch.tensor((edge[0], edge[1]), dtype=torch.int32).to(torch.int64).unsqueeze(1)), dim=1) + edge_index = torch.cat((edge_index, + torch.tensor((edge[1], edge[0]), dtype=torch.int32).to(torch.int64).unsqueeze(1)), dim=1) + gen_dataset.data.edge_index = edge_index + + class RandomPoisonAttack(PoisonAttacker): name = "RandomPoisonAttack" diff --git a/src/models_builder/models_zoo.py b/src/models_builder/models_zoo.py index a8a4996..07dad7f 100644 --- a/src/models_builder/models_zoo.py +++ b/src/models_builder/models_zoo.py @@ -283,7 +283,7 @@ def model_configs_zoo(dataset, model_name): 'layer_name': 'GCNConv', 'layer_kwargs': { 'in_channels': dataset.num_node_features, - 'out_channels': 7, + 'out_channels': 16, }, }, 'activation': { @@ -297,7 +297,7 @@ def model_configs_zoo(dataset, model_name): 'layer': { 'layer_name': 'GCNConv', 'layer_kwargs': { - 'in_channels': 7, + 'in_channels': 16, 'out_channels': dataset.num_classes, }, }, From 4df4669c09d519cdebe7111f233e061dc7cb7c05 Mon Sep 17 00:00:00 2001 From: mikhail Date: Tue, 13 Aug 2024 13:33:01 +0300 Subject: [PATCH 2/9] add some changes --- experiments/attack_defense_test.py | 25 +++++++++---------------- metainfo/poison_attack_parameters.json | 9 ++++----- src/attacks/nettack/nettack.py | 4 ++-- src/attacks/nettack/utils.py | 6 +++--- src/attacks/poison_attacks.py | 15 ++++++++------- 5 files changed, 26 insertions(+), 33 deletions(-) diff --git a/experiments/attack_defense_test.py b/experiments/attack_defense_test.py index a8dfa60..d73fb2c 100644 --- a/experiments/attack_defense_test.py +++ b/experiments/attack_defense_test.py @@ -188,18 +188,12 @@ def test_nettack_attack(): dataset_ver_ind=0 ) - n = dataset.data.x.size(0) - train_mask = torch.full((2708,), True, dtype=torch.bool) - num_false_elements = int(n * 0.3) - indices = torch.randperm(n)[1:num_false_elements] - train_mask.index_fill_(0, indices, False) - train_mask[node_idx] = False - - dataset.train_mask = train_mask - dataset.test_mask =~ train_mask - - dataset.val_mask = torch.full((2708,), False, dtype=torch.bool) - # dataset.test_mask = 
torch.full((2708,), False, dtype=torch.bool) + # Create mask + train_test_split_coeff = 0.7 + train_mask = torch.rand(data.x.size(0)) < train_test_split_coeff # 70% True, 30% False + data.train_mask = train_mask + data.test_mask = ~ train_mask + data.val_mask = torch.zeros(data.x.size(0), dtype=torch.bool) # Train model on original dataset and remember the model metric and node predicted probability gcn_gcn = model_configs_zoo(dataset=dataset, model_name='gcn_gcn') @@ -231,7 +225,7 @@ def test_nettack_attack(): metrics=[Metric("F1", mask='train', average=None)]) # save train_test_mask to test the model on poisoned data with the same split - dataset.save_train_test_mask(train_test_split_path) # TODO сделать сохранение разбиения test/train + # dataset.save_train_test_mask(train_test_split_path) # TODO сделать сохранение разбиения test/train metric_original_dataset = gcn_gcn_model_manager.evaluate_model( gen_dataset=dataset, @@ -258,10 +252,9 @@ def test_nettack_attack(): _config_class="PoisonAttackConfig", _config_kwargs={ "node_idx": node_idx, - "direct_attack": True, - "n_influencers": 5, "perturb_features": True, - "perturb_structure": True + "perturb_structure": True, + "hidden": gcn_gcn_model_manager.gnn.GCNConv_0.out_channels } ) new_gcn_gcn = model_configs_zoo(dataset=dataset, model_name='gcn_gcn') diff --git a/metainfo/poison_attack_parameters.json b/metainfo/poison_attack_parameters.json index ab032c7..405a3d9 100644 --- a/metainfo/poison_attack_parameters.json +++ b/metainfo/poison_attack_parameters.json @@ -5,11 +5,10 @@ "n_edges_percent": ["n_edges_percent", "float", 0.1, {"min": 0.0001, "step": 0.01}, "?"] }, "NettackPoisonAttack": { - "node_idx": ["node_idx", "int", 0, {"min": 0, "step": 1}, "Index of teh node to attack"], - "direct_attack": ["direct_attack", "bool", true, {}, "?"], - "n_influencers": ["n_influencers", "int", 5, {"min": 1, "step": 1}, "?"], - "perturb_features": ["perturb_features", "bool", true, {}, "?"], - "perturb_structure": ["perturb_structure", "bool", true, {}, "?"] + "node_idx": ["node_idx", "int", 0, {"min": 0, "step": 1}, "Index of the node to attack"], + "perturb_features": ["perturb_features", "bool", true, {}, "Indicates whether the features can be changed"], + "perturb_structure": ["perturb_structure", "bool", true, {}, "Indicates whether the structure can be changed"], + "hidden": ["hidden", "int", 16, {"min": 1, "step": 1}, "Dimension of hidden layer"] } } diff --git a/src/attacks/nettack/nettack.py b/src/attacks/nettack/nettack.py index 11784e7..4ba5870 100644 --- a/src/attacks/nettack/nettack.py +++ b/src/attacks/nettack/nettack.py @@ -468,7 +468,6 @@ def reset(self): self.cooc_constraint = None -# @jit(nopython=True) def connected_after(u, v, connected_before, delta): if u == v: if delta == -1: @@ -478,7 +477,7 @@ def connected_after(u, v, connected_before, delta): else: return connected_before -# @jit(nopython=True) + def compute_new_a_hat_uv(edge_ixs, node_nb_ixs, edges_set, twohop_ixs, values_before, degs, potential_edges, u): """ Compute the new values [A_hat_square]_u for every potential edge, where u is the target node. C.f. Theorem 5.1 @@ -575,6 +574,7 @@ def compute_new_a_hat_uv(edge_ixs, node_nb_ixs, edges_set, twohop_ixs, values_be return return_ixs, return_values + def compute_alpha(n, S_d, d_min): """ Approximate the alpha of a power law distribution. 
diff --git a/src/attacks/nettack/utils.py b/src/attacks/nettack/utils.py index d436908..e6f3b91 100644 --- a/src/attacks/nettack/utils.py +++ b/src/attacks/nettack/utils.py @@ -84,10 +84,10 @@ def data_to_csr_matrix(data): return adj_matrix, attr_matrix, labels -def learn_w1_w2(dataset): - data = dataset.dataset.data +def learn_w1_w2(dataset, hidden): + data = dataset.data # TODO передавать параметр hidden - model_gnn_lin = GNNLinear(dataset.num_node_features, 16, dataset.num_classes) + model_gnn_lin = GNNLinear(dataset.num_node_features, hidden, dataset.num_classes) optimizer = torch.optim.Adam(model_gnn_lin.parameters(), lr=0.001, diff --git a/src/attacks/poison_attacks.py b/src/attacks/poison_attacks.py index 15cc5ac..42193a6 100644 --- a/src/attacks/poison_attacks.py +++ b/src/attacks/poison_attacks.py @@ -20,18 +20,18 @@ class NettackPoisonAttack(PoisonAttacker): def __init__(self, node_idx=0, - direct_attack=True, - n_influencers=5, perturb_features=True, - perturb_structure=True): + perturb_structure=True, + hidden=16): super().__init__() self.attack_diff = None self.node_idx = node_idx - self.direct_attack = direct_attack - self.n_influencers = n_influencers self.perturb_features = perturb_features self.perturb_structure = perturb_structure + self.hidden = hidden + self.direct_attack = True + self.n_influencers = 0 def attack(self, gen_dataset): # Prepare @@ -53,14 +53,15 @@ def attack(self, gen_dataset): _K = _z_obs.max() + 1 _Z_obs = np.eye(_K)[_z_obs] _An = preprocess_graph(_A_obs) - sizes = [16, _K] degrees = _A_obs.sum(0).A1 n_perturbations = int(degrees[self.node_idx]) # n_perturbations = 3 # End prepare # Learn matrix W1 and W2 - W1, W2 = learn_w1_w2(gen_dataset) + # TODO Here I need access to the model, namely the value of the hidden layer. 
+ # Therefore I will pass it to the attack init + W1, W2 = learn_w1_w2(gen_dataset, self.hidden) # Attack nettack = Nettack(_A_obs, _X_obs, _z_obs, W1, W2, self.node_idx, verbose=True) From 940f0ebe342d296cb47c1f3ed696ee14c5141476 Mon Sep 17 00:00:00 2001 From: mikhail Date: Wed, 14 Aug 2024 17:01:57 +0300 Subject: [PATCH 3/9] remake Nettack test for evasion --- experiments/attack_defense_test.py | 112 ++++++++++-------------- metainfo/evasion_attack_parameters.json | 8 ++ metainfo/poison_attack_parameters.json | 6 -- src/attacks/evasion_attacks.py | 103 ++++++++++++++++++++++ src/attacks/nettack/utils.py | 16 ++-- src/attacks/poison_attacks.py | 107 +--------------------- 6 files changed, 167 insertions(+), 185 deletions(-) diff --git a/experiments/attack_defense_test.py b/experiments/attack_defense_test.py index d73fb2c..cbcb6e5 100644 --- a/experiments/attack_defense_test.py +++ b/experiments/attack_defense_test.py @@ -175,12 +175,9 @@ def test_attack_defense(): print(metric_loc) -def test_nettack_attack(): +def test_nettack_evasion(): my_device = device('cpu') - # Choose the node index - node_idx = 1900 - # Load dataset full_name = ("single-graph", "Planetoid", 'Cora') dataset, data, results_dataset_path = DatasetManager.get_by_full_name( @@ -188,13 +185,6 @@ def test_nettack_attack(): dataset_ver_ind=0 ) - # Create mask - train_test_split_coeff = 0.7 - train_mask = torch.rand(data.x.size(0)) < train_test_split_coeff # 70% True, 30% False - data.train_mask = train_mask - data.test_mask = ~ train_mask - data.val_mask = torch.zeros(data.x.size(0), dtype=torch.bool) - # Train model on original dataset and remember the model metric and node predicted probability gcn_gcn = model_configs_zoo(dataset=dataset, model_name='gcn_gcn') @@ -209,99 +199,85 @@ def test_nettack_attack(): } ) - steps_epochs = 2000 - gcn_gcn_model_manager = FrameworkGNNModelManager( + gnn_model_manager = FrameworkGNNModelManager( gnn=gcn_gcn, dataset_path=results_dataset_path, manager_config=manager_config, modification=ModelModificationConfig(model_ver_ind=0, epochs=0) ) - gcn_gcn_model_manager.gnn.to(my_device) + gnn_model_manager.gnn.to(my_device) - train_test_split_path = gcn_gcn_model_manager.train_model(gen_dataset=dataset, - steps=steps_epochs, - save_model_flag=False, - metrics=[Metric("F1", mask='train', average=None)]) + num_steps = 200 + gnn_model_manager.train_model(gen_dataset=dataset, + steps=num_steps, + save_model_flag=False) - # save train_test_mask to test the model on poisoned data with the same split - # dataset.save_train_test_mask(train_test_split_path) # TODO сделать сохранение разбиения test/train + # Evaluate model + acc_train = gnn_model_manager.evaluate_model(gen_dataset=dataset, + metrics=[Metric("Accuracy", mask='train')])['train']['Accuracy'] + acc_test = gnn_model_manager.evaluate_model(gen_dataset=dataset, + metrics=[Metric("Accuracy", mask='test')])['test']['Accuracy'] + print(f"Accuracy on train: {acc_train}. 
Accuracy on test: {acc_test}") - metric_original_dataset = gcn_gcn_model_manager.evaluate_model( - gen_dataset=dataset, - metrics=[Metric("Accuracy", mask='test')])['test'] + # Node for attack + node_idx = 0 - gcn_gcn_model_manager.gnn.eval() + # Model prediction on a node before an evasion attack on it + gnn_model_manager.gnn.eval() with torch.no_grad(): - probabilities = torch.exp(gcn_gcn_model_manager.gnn(dataset.data.x, dataset.data.edge_index)) + probabilities = torch.exp(gnn_model_manager.gnn(dataset.data.x, dataset.data.edge_index)) predicted_class = probabilities[node_idx].argmax().item() predicted_probability = probabilities[node_idx][predicted_class].item() real_class = dataset.data.y[node_idx].item() - original_dataset_predictions_info = {"metric_original_dataset": metric_original_dataset, - "node_idx": node_idx, - "predicted_class": predicted_class, - "predicted_probability": predicted_probability, - "real_class": real_class} + info_before_evasion_attack = {"node_idx": node_idx, + "predicted_class": predicted_class, + "predicted_probability": predicted_probability, + "real_class": real_class} - # Attack - nettack_poison_attack_config = ConfigPattern( - _class_name="NettackPoisonAttack", - _import_path=POISON_ATTACK_PARAMETERS_PATH, - _config_class="PoisonAttackConfig", + # Attack config + evasion_attack_config = ConfigPattern( + _class_name="NettackEvasionAttacker", + _import_path=EVASION_ATTACK_PARAMETERS_PATH, + _config_class="EvasionAttackConfig", _config_kwargs={ "node_idx": node_idx, + "n_perturbations": 20, "perturb_features": True, "perturb_structure": True, - "hidden": gcn_gcn_model_manager.gnn.GCNConv_0.out_channels + "direct": True, + "n_influencers": 0 } ) - new_gcn_gcn = model_configs_zoo(dataset=dataset, model_name='gcn_gcn') - new_gcn_gcn_model_manager = FrameworkGNNModelManager( - gnn=new_gcn_gcn, - dataset_path=results_dataset_path, - manager_config=manager_config, - modification=ModelModificationConfig(model_ver_ind=0, epochs=0) - ) - new_gcn_gcn_model_manager.set_poison_attacker(poison_attack_config=nettack_poison_attack_config) + gnn_model_manager.set_evasion_attacker(evasion_attack_config=evasion_attack_config) - # TODO сделать сохранение разбиения test/train - # train_mask, val_mask, test_mask, train_test_sizes = torch.load(train_test_split_path / 'train_test_split')[:] - # dataset.train_mask, dataset.val_mask, dataset.test_mask = train_mask, val_mask, test_mask - # data.percent_train_class, data.percent_test_class = train_test_sizes - - new_gcn_gcn_model_manager.train_model(gen_dataset=dataset, - steps=steps_epochs, - save_model_flag=False, - metrics=[Metric("F1", mask='train', average=None)]) - - metric_poison_dataset = new_gcn_gcn_model_manager.evaluate_model( - gen_dataset=new_gcn_gcn_model_manager.poison_attacker.attack_diff, - metrics=[Metric("Accuracy", mask='test')])['test'] + # Attack + gnn_model_manager.evaluate_model(gen_dataset=dataset, metrics=[Metric("F1", mask='test', average='macro')]) - new_gcn_gcn_model_manager.gnn.eval() + # Model prediction on a node after an evasion attack on it with torch.no_grad(): - probabilities = torch.exp(new_gcn_gcn_model_manager.gnn(new_gcn_gcn_model_manager.poison_attacker.attack_diff.data.x, - new_gcn_gcn_model_manager.poison_attacker.attack_diff.data.edge_index)) + probabilities = torch.exp(gnn_model_manager.gnn(gnn_model_manager.evasion_attacker.attack_diff.data.x, + gnn_model_manager.evasion_attacker.attack_diff.data.edge_index)) predicted_class = probabilities[node_idx].argmax().item() 
predicted_probability = probabilities[node_idx][predicted_class].item() real_class = dataset.data.y[node_idx].item() - poisoned_dataset_predictions_info = {"metric_poison_dataset": metric_poison_dataset, - "node_idx": node_idx, - "predicted_class": predicted_class, - "predicted_probability": predicted_probability, - "real_class": real_class} + info_after_evasion_attack = {"node_idx": node_idx, + "predicted_class": predicted_class, + "predicted_probability": predicted_probability, + "real_class": real_class} - print(original_dataset_predictions_info) - print(poisoned_dataset_predictions_info) + print(f"info_before_evasion_attack: {info_before_evasion_attack}") + print(f"info_after_evasion_attack: {info_after_evasion_attack}") if __name__ == '__main__': # test_attack_defense() - test_nettack_attack() + test_nettack_evasion() + diff --git a/metainfo/evasion_attack_parameters.json b/metainfo/evasion_attack_parameters.json index b2ce9cc..3d54f1b 100644 --- a/metainfo/evasion_attack_parameters.json +++ b/metainfo/evasion_attack_parameters.json @@ -3,6 +3,14 @@ }, "FGSM": { "epsilon": ["epsilon", "float", 0.1, {"min": 0.0001, "step": 0.01}, "?"] + }, + "NettackEvasionAttacker": { + "node_idx": ["node_idx", "int", 0, {"min": 0, "step": 1}, "Index of the node to attack"], + "n_perturbations": ["n_perturbations", "int", null, {"min": 0, "step": 1}, "Number of perturbations. If None, then n_perturbations = degree(node_idx)"], + "perturb_features": ["perturb_features", "bool", true, {}, "Indicates whether the features can be changed"], + "perturb_structure": ["perturb_structure", "bool", true, {}, "Indicates whether the structure can be changed"], + "direct": ["direct", "bool", true, {}, "Indicates whether to directly modify edges/features of the node attacked or only those of influencers"], + "n_influencers": ["n_influencers", "int", 0, {"min": 0, "step": 1}, "Number of influencing nodes. 
Will be ignored if direct is True"] } } diff --git a/metainfo/poison_attack_parameters.json b/metainfo/poison_attack_parameters.json index 405a3d9..980cabd 100644 --- a/metainfo/poison_attack_parameters.json +++ b/metainfo/poison_attack_parameters.json @@ -3,12 +3,6 @@ }, "RandomPoisonAttack": { "n_edges_percent": ["n_edges_percent", "float", 0.1, {"min": 0.0001, "step": 0.01}, "?"] - }, - "NettackPoisonAttack": { - "node_idx": ["node_idx", "int", 0, {"min": 0, "step": 1}, "Index of the node to attack"], - "perturb_features": ["perturb_features", "bool", true, {}, "Indicates whether the features can be changed"], - "perturb_structure": ["perturb_structure", "bool", true, {}, "Indicates whether the structure can be changed"], - "hidden": ["hidden", "int", 16, {"min": 1, "step": 1}, "Dimension of hidden layer"] } } diff --git a/src/attacks/evasion_attacks.py b/src/attacks/evasion_attacks.py index e0f85d0..a5be784 100644 --- a/src/attacks/evasion_attacks.py +++ b/src/attacks/evasion_attacks.py @@ -1,8 +1,13 @@ import torch import torch.nn.functional as F +import numpy as np from attacks.attack_base import Attacker +# Nettack imports +from src.attacks.nettack.nettack import Nettack +from src.attacks.nettack.utils import preprocess_graph, largest_connected_components, data_to_csr_matrix, train_w1_w2 + class EvasionAttacker(Attacker): def __init__(self, **kwargs): @@ -35,3 +40,101 @@ def attack(self, model_manager, gen_dataset, mask_tensor): perturbed_data_x = torch.clamp(perturbed_data_x, 0, 1) gen_dataset.data.x = perturbed_data_x.detach() return gen_dataset + + +class NettackEvasionAttacker(EvasionAttacker): + name = "NettackEvasionAttacker" + + def __init__(self, + node_idx=0, + n_perturbations=None, + perturb_features=True, + perturb_structure=True, + direct=True, + n_influencers=0 + ): + + super().__init__() + self.attack_diff = None + self.node_idx = node_idx + self.n_perturbations = n_perturbations + self.perturb_features = perturb_features + self.perturb_structure = perturb_structure + self.direct = direct + self.n_influencers = n_influencers + + def attack(self, model_manager, gen_dataset, mask_tensor): + # Prepare + data = gen_dataset.data + _A_obs, _X_obs, _z_obs = data_to_csr_matrix(data) + _A_obs = _A_obs + _A_obs.T + _A_obs[_A_obs > 1] = 1 + lcc = largest_connected_components(_A_obs) + + _A_obs = _A_obs[lcc][:, lcc] + + assert np.abs(_A_obs - _A_obs.T).sum() == 0, "Input graph is not symmetric" + assert _A_obs.max() == 1 and len(np.unique(_A_obs[_A_obs.nonzero()].A1)) == 1, "Graph must be unweighted" + assert _A_obs.sum(0).A1.min() > 0, "Graph contains singleton nodes" + + _X_obs = _X_obs[lcc].astype('float32') + _z_obs = _z_obs[lcc] + _N = _A_obs.shape[0] + _K = _z_obs.max() + 1 + _Z_obs = np.eye(_K)[_z_obs] + _An = preprocess_graph(_A_obs) + degrees = _A_obs.sum(0).A1 + + if self.n_perturbations is None: + self.n_perturbations = int(degrees[self.node_idx]) + hidden = model_manager.gnn.GCNConv_0.out_channels + # End prepare + + # Learn matrix W1 and W2 + W1, W2 = train_w1_w2(dataset=gen_dataset, hidden=hidden) + + # Attack + nettack = Nettack(_A_obs, _X_obs, _z_obs, W1, W2, self.node_idx, verbose=True) + + nettack.reset() + nettack.attack_surrogate(n_perturbations=self.n_perturbations, + perturb_structure=self.perturb_structure, + perturb_features=self.perturb_features, + direct=self.direct, + n_influencers=self.n_influencers) + + print(f'edges: {nettack.structure_perturbations}') + print(f'features: {nettack.feature_perturbations}') + + self._evasion(gen_dataset, 
nettack.feature_perturbations, nettack.structure_perturbations) + self.attack_diff = gen_dataset + + return gen_dataset + + def attack_diff(self): + return self.attack_diff + + @staticmethod + def _evasion(gen_dataset, feature_perturbations, structure_perturbations): + cleaned_feat_pert = list(filter(None, feature_perturbations)) + if cleaned_feat_pert: # list is not empty + x = gen_dataset.data.x.clone() + for vertex, feature in cleaned_feat_pert: + if x[vertex, feature] == 0.0: + x[vertex, feature] = 1.0 + elif x[vertex, feature] == 1.0: + x[vertex, feature] = 0.0 + gen_dataset.data.x = x + + cleaned_struct_pert = list(filter(None, structure_perturbations)) + if cleaned_struct_pert: # list is not empty + edge_index = gen_dataset.data.edge_index.clone() + # add edges + for edge in cleaned_struct_pert: + edge_index = torch.cat((edge_index, + torch.tensor((edge[0], edge[1]), dtype=torch.int32).to(torch.int64).unsqueeze(1)), dim=1) + edge_index = torch.cat((edge_index, + torch.tensor((edge[1], edge[0]), dtype=torch.int32).to(torch.int64).unsqueeze(1)), dim=1) + + gen_dataset.data.edge_index = edge_index + \ No newline at end of file diff --git a/src/attacks/nettack/utils.py b/src/attacks/nettack/utils.py index e6f3b91..3ff10c8 100644 --- a/src/attacks/nettack/utils.py +++ b/src/attacks/nettack/utils.py @@ -13,12 +13,12 @@ def __init__(self, num_features, hidden, num_classes): super(GNNLinear, self).__init__() # Initialize the layers - self.conv1 = GCNConv(num_features, hidden, add_self_loops=False, bias=False) - self.conv2 = GCNConv(hidden, num_classes, add_self_loops=False, bias=False) + self.conv0 = GCNConv(num_features, hidden, add_self_loops=False, bias=False) + self.conv1 = GCNConv(hidden, num_classes, add_self_loops=False, bias=False) def forward(self, x=None, edge_index=None, **kwargs): + x = self.conv0(x, edge_index) x = self.conv1(x, edge_index) - x = self.conv2(x, edge_index) x = F.log_softmax(x, dim=1) return x @@ -84,16 +84,16 @@ def data_to_csr_matrix(data): return adj_matrix, attr_matrix, labels -def learn_w1_w2(dataset, hidden): +def train_w1_w2(dataset, hidden): data = dataset.data - # TODO передавать параметр hidden + model_gnn_lin = GNNLinear(dataset.num_node_features, hidden, dataset.num_classes) optimizer = torch.optim.Adam(model_gnn_lin.parameters(), lr=0.001, weight_decay=5e-4) - num_epochs = 2000 + num_epochs = 1000 print("Train surrogate model") for epoch in tqdm(range(num_epochs)): model_gnn_lin.train() @@ -104,8 +104,8 @@ def learn_w1_w2(dataset, hidden): optimizer.step() print("End training") - W1 = model_gnn_lin.conv1.lin.weight.T - W2 = model_gnn_lin.conv2.lin.weight.T + W1 = model_gnn_lin.conv0.lin.weight.T + W2 = model_gnn_lin.conv1.lin.weight.T return W1, W2 diff --git a/src/attacks/poison_attacks.py b/src/attacks/poison_attacks.py index 42193a6..f8dc546 100644 --- a/src/attacks/poison_attacks.py +++ b/src/attacks/poison_attacks.py @@ -3,109 +3,17 @@ from attacks.attack_base import Attacker -# Nettack imports -from src.attacks.nettack.nettack import Nettack -from src.attacks.nettack.utils import preprocess_graph, largest_connected_components, data_to_csr_matrix, learn_w1_w2 -from torch_geometric.data import Data -# Nettack imports end - class PoisonAttacker(Attacker): def __init__(self, **kwargs): super().__init__() -class NettackPoisonAttack(PoisonAttacker): - name = "NettackPoisonAttack" - - def __init__(self, - node_idx=0, - perturb_features=True, - perturb_structure=True, - hidden=16): - - super().__init__() - self.attack_diff = None - self.node_idx = 
diff --git a/src/attacks/poison_attacks.py b/src/attacks/poison_attacks.py
index 42193a6..f8dc546 100644
--- a/src/attacks/poison_attacks.py
+++ b/src/attacks/poison_attacks.py
@@ -3,109 +3,17 @@
 from attacks.attack_base import Attacker
 
-# Nettack imports
-from src.attacks.nettack.nettack import Nettack
-from src.attacks.nettack.utils import preprocess_graph, largest_connected_components, data_to_csr_matrix, learn_w1_w2
-from torch_geometric.data import Data
-# Nettack imports end
-
 
 class PoisonAttacker(Attacker):
     def __init__(self, **kwargs):
         super().__init__()
 
 
-class NettackPoisonAttack(PoisonAttacker):
-    name = "NettackPoisonAttack"
-
-    def __init__(self,
-                 node_idx=0,
-                 perturb_features=True,
-                 perturb_structure=True,
-                 hidden=16):
-
-        super().__init__()
-        self.attack_diff = None
-        self.node_idx = node_idx
-        self.perturb_features = perturb_features
-        self.perturb_structure = perturb_structure
-        self.hidden = hidden
-        self.direct_attack = True
-        self.n_influencers = 0
-
-    def attack(self, gen_dataset):
-        # Prepare
-        data = gen_dataset.data
-        _A_obs, _X_obs, _z_obs = data_to_csr_matrix(data)
-        _A_obs = _A_obs + _A_obs.T
-        _A_obs[_A_obs > 1] = 1
-        lcc = largest_connected_components(_A_obs)
-
-        _A_obs = _A_obs[lcc][:, lcc]
-
-        assert np.abs(_A_obs - _A_obs.T).sum() == 0, "Input graph is not symmetric"
-        assert _A_obs.max() == 1 and len(np.unique(_A_obs[_A_obs.nonzero()].A1)) == 1, "Graph must be unweighted"
-        assert _A_obs.sum(0).A1.min() > 0, "Graph contains singleton nodes"
-
-        _X_obs = _X_obs[lcc].astype('float32')
-        _z_obs = _z_obs[lcc]
-        _N = _A_obs.shape[0]
-        _K = _z_obs.max() + 1
-        _Z_obs = np.eye(_K)[_z_obs]
-        _An = preprocess_graph(_A_obs)
-        degrees = _A_obs.sum(0).A1
-        n_perturbations = int(degrees[self.node_idx])
-        # n_perturbations = 3
-        # End prepare
-
-        # Learn matrix W1 and W2
-        # TODO Here I need access to the model, namely the value of the hidden layer.
-        # Therefore I will pass it to the attack init
-        W1, W2 = learn_w1_w2(gen_dataset, self.hidden)
-
-        # Attack
-        nettack = Nettack(_A_obs, _X_obs, _z_obs, W1, W2, self.node_idx, verbose=True)
-
-        nettack.reset()
-        nettack.attack_surrogate(n_perturbations,
-                                 perturb_structure=self.perturb_structure,
-                                 perturb_features=self.perturb_features,
-                                 direct=self.direct_attack,
-                                 n_influencers=self.n_influencers)
-
-        print(f'edges: {nettack.structure_perturbations}')
-        print(f'features: {nettack.feature_perturbations}')
-
-        self._poisoning(gen_dataset, nettack.feature_perturbations, nettack.structure_perturbations)
-        self.attack_diff = gen_dataset
-
-        return gen_dataset
-
-    def attack_diff(self):
-        return self.attack_diff
-
-    @staticmethod
-    def _poisoning(gen_dataset, feature_perturbations, structure_perturbations):
-        cleaned_feat_pert = list(filter(None, feature_perturbations))
-        if cleaned_feat_pert:  # list is not empty
-            x = gen_dataset.data.x.clone()
-            for vertex, feature in cleaned_feat_pert:
-                if x[vertex, feature] == 0.0:
-                    x[vertex, feature] = 1.0
-                elif x[vertex, feature] == 1.0:
-                    x[vertex, feature] = 0.0
-            gen_dataset.data.x = x
+class EmptyPoisonAttacker(PoisonAttacker):
+    name = "EmptyPoisonAttacker"
 
-        cleaned_struct_pert = list(filter(None, structure_perturbations))
-        if cleaned_struct_pert:  # list is not empty
-            edge_index = gen_dataset.data.edge_index.clone()
-            # add edges
-            for edge in cleaned_struct_pert:
-                edge_index = torch.cat((edge_index,
-                                        torch.tensor((edge[0], edge[1]), dtype=torch.int32).to(torch.int64).unsqueeze(1)), dim=1)
-                edge_index = torch.cat((edge_index,
-                                        torch.tensor((edge[1], edge[0]), dtype=torch.int32).to(torch.int64).unsqueeze(1)), dim=1)
-            gen_dataset.data.edge_index = edge_index
+    def attack(self, **kwargs):
+        pass
 
 
 class RandomPoisonAttack(PoisonAttacker):
@@ -134,10 +42,3 @@ def attack(self, gen_dataset):
 
     def attack_diff(self):
         return self.attack_diff
-
-
-class EmptyPoisonAttacker(PoisonAttacker):
-    name = "EmptyPoisonAttacker"
-
-    def attack(self, **kwargs):
-        pass
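
With NettackPoisonAttack removed, only the generic PoisonAttacker interface used by EmptyPoisonAttacker and RandomPoisonAttack remains on the poisoning side: attack(gen_dataset) perturbs the dataset and attack_diff keeps the result. A hedged sketch of a custom subclass against that interface follows; the class name and its noise model are invented for illustration and do not exist in the repository.

import torch

from attacks.poison_attacks import PoisonAttacker  # assumed import path

class GaussianNoisePoisonAttack(PoisonAttacker):  # hypothetical example
    name = "GaussianNoisePoisonAttack"

    def __init__(self, scale=0.01):
        super().__init__()
        self.scale = scale
        self.attack_diff = None

    def attack(self, gen_dataset):
        # Toy poisoning: add small Gaussian noise to every node feature.
        x = gen_dataset.data.x.clone()
        gen_dataset.data.x = x + self.scale * torch.randn_like(x)
        self.attack_diff = gen_dataset
        return gen_dataset
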

From b5f0e08778df51d5bf4cda9904dd02e9c48e71c7 Mon Sep 17 00:00:00 2001
From: mikhail
Date: Thu, 15 Aug 2024 13:36:38 +0300
Subject: [PATCH 4/9] add dependency in requirements3.txt

---
 requirements3.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements3.txt b/requirements3.txt
index c4b5736..6c928cd 100644
--- a/requirements3.txt
+++ b/requirements3.txt
@@ -9,6 +9,7 @@ torch-spline-conv==1.2.2
 # For explainers
 dive-into-graphs==1.1.0
 pgmpy # PGMExplainer
+google-generativeai # PGMExplainer
 
 # For synthetic datasets
 matplotlib

From 741ab97f7ef2edb0bafa53866e64da3cb24a1f2b Mon Sep 17 00:00:00 2001
From: mikhail
Date: Thu, 15 Aug 2024 14:08:17 +0300
Subject: [PATCH 5/9] add dependencies in requirements3.txt

---
 requirements3.txt | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/requirements3.txt b/requirements3.txt
index 6c928cd..ba51ea0 100644
--- a/requirements3.txt
+++ b/requirements3.txt
@@ -8,8 +8,15 @@ torch-spline-conv==1.2.2
 
 # For explainers
 dive-into-graphs==1.1.0
-pgmpy # PGMExplainer
-google-generativeai # PGMExplainer
+
+# PGMExplainer
+pgmpy
+google-generativeai
+google
+google-cloud
+google-cloud-aiplatform
+google-api-python-client
+google-cloud-language
 
 # For synthetic datasets
 matplotlib

From 7744c95c92c69d48a06aeb51f8c214b1d56d9db5 Mon Sep 17 00:00:00 2001
From: mikhail
Date: Thu, 15 Aug 2024 16:39:37 +0300
Subject: [PATCH 6/9] add dependencies in requirements3.txt

---
 requirements3.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements3.txt b/requirements3.txt
index ba51ea0..23e2360 100644
--- a/requirements3.txt
+++ b/requirements3.txt
@@ -17,6 +17,7 @@ google-cloud
 google-cloud-aiplatform
 google-api-python-client
 google-cloud-language
+google-ai-generativelanguage
 
 # For synthetic datasets
 matplotlib

From 3204d95a6f4c6e5355eeb5f23c2e1058b8d904f4 Mon Sep 17 00:00:00 2001
From: mikhail
Date: Thu, 15 Aug 2024 17:28:35 +0300
Subject: [PATCH 7/9] add dependencies in requirements3.txt

---
 requirements3.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements3.txt b/requirements3.txt
index 23e2360..501991d 100644
--- a/requirements3.txt
+++ b/requirements3.txt
@@ -18,6 +18,7 @@ google-cloud-aiplatform
 google-api-python-client
 google-cloud-language
 google-ai-generativelanguage
+google-api-core
 
 # For synthetic datasets
 matplotlib

From 4c7225796b783a6059b4baa1cae2865d7df270b4 Mon Sep 17 00:00:00 2001
From: mikhail
Date: Thu, 15 Aug 2024 18:08:40 +0300
Subject: [PATCH 8/9] add dependencies in requirements3.txt

---
 requirements3.txt | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/requirements3.txt b/requirements3.txt
index 501991d..5c62a81 100644
--- a/requirements3.txt
+++ b/requirements3.txt
@@ -11,14 +11,32 @@ dive-into-graphs==1.1.0
 
 # PGMExplainer
 pgmpy
-google-generativeai
+# google-generativeai
+# google
+# google-cloud
+# google-cloud-aiplatform
+# google-api-python-client
+# google-cloud-language
+# google-ai-generativelanguage
+# google-api-core
 google
-google-cloud
-google-cloud-aiplatform
-google-api-python-client
-google-cloud-language
-google-ai-generativelanguage
+# google--async-resumable-media
+# google--upb
+google-ai
+google-api
 google-api-core
+google-auth
+google-cloud
+google-gapic
+google-generativeai
+goggle-iam
+google-logging
+google-longrunning
+google-oauth2
+google-protobuf
+google-resumable-media
+google-rpc
+google-type
 
 # For synthetic datasets
 matplotlib

From 58797ca3f536ef1c58f16f2ba269315845da1223 Mon Sep 17 00:00:00 2001
From: mikhail
Date: Fri, 23 Aug 2024 18:24:24 +0300
Subject: [PATCH 9/9] add dependency in requirements3.txt

---
 requirements3.txt | 28 +---------------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/requirements3.txt b/requirements3.txt
index 5c62a81..225c65a 100644
--- a/requirements3.txt
+++ b/requirements3.txt
@@ -10,33 +10,7 @@ torch-spline-conv==1.2.2
 dive-into-graphs==1.1.0
 
 # PGMExplainer
-pgmpy
-# google-generativeai
-# google
-# google-cloud
-# google-cloud-aiplatform
-# google-api-python-client
-# google-cloud-language
-# google-ai-generativelanguage
-# google-api-core
-google
-# google--async-resumable-media
-# google--upb
-google-ai
-google-api
-google-api-core
-google-auth
-google-cloud
-google-gapic
-google-generativeai
-goggle-iam
-google-logging
-google-longrunning
-google-oauth2
-google-protobuf
-google-resumable-media
-google-rpc
-google-type
+pgmpy==0.1.24
 
 # For synthetic datasets
 matplotlib