-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathchip_model_fitting.py
137 lines (110 loc) · 7.34 KB
/
chip_model_fitting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# -*- coding: utf-8 -*-
"""
@author: Makan Arastuie
"""
import time
import numpy as np
import dataset_utils
import generative_model_utils as utils
import model_fitting_utils as model_utils
def fit_and_eval_community_hawkes(train_tuple, test_tuple, combined_tuple, nodes_not_in_train,
                                  k_values_to_test=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
                                  local_search_max_iter=0, local_search_n_cores=-1,
                                  plot_fitted_hist=False, verbose=False, *, plot_n_cores=26):
    """
    Fits the CHIP model to train and evaluates the log-likelihood on the test, by evaluating the
    log-likelihood on the combined dataset and subtracting the likelihood of train, dividing by number of events in test

    For every K in `k_values_to_test` one line with the train and test log-likelihood per event and the elapsed
    time is printed; the total elapsed time is printed at the end.

    :param train_tuple, test_tuple, combined_tuple: A tuple of (event dict, number of nodes, duration)
    :param nodes_not_in_train: Nodes that are in the test data, but not in the train
    :param k_values_to_test: iterable obj of number of communities to fit
    :param local_search_max_iter: if >0, then the model is fitted using local search, else local search is not used.
    :param local_search_n_cores: Number of cores to be used for local search. Ignored if local_search_max_iter <= 0.
    :param plot_fitted_hist: If True, generates a CHIP model network based on the fitted parameters and plots a
                             histogram of the event count of real vs. fitted model.
    :param verbose: Prints details of the fit along the way.
    :param plot_n_cores: (keyword-only) forwarded as `n_cores` to `model_utils.generate_fit_community_hawkes`.
                         Only used if `plot_fitted_hist` is True. Defaults to 26, the previously hard-coded value.
    :return: (list) test log-likelihood per event for all `k_values_to_test`, in iteration order.
    """
    # Each tuple must have exactly three entries; entries this function never reads are bound to `_`.
    train_event_dict, train_num_nodes, train_duration = train_tuple
    test_event_dict, test_num_nodes, _ = test_tuple
    combined_event_dict, _, combined_duration = combined_tuple

    total_tic = time.time()
    print("Log-likelihoods per event:")

    # The number of train events does not depend on K, so it is counted only once. It is computed lazily
    # (on the first iteration) so that an empty `k_values_to_test` still does no work at all.
    train_n_events = None

    lls_per_event = []
    for num_classes in k_values_to_test:
        if verbose:
            print("K:", num_classes)

        tic = time.time()

        # Fitting the model to the train data
        train_node_membership, train_bp_mu, train_bp_alpha, train_bp_beta, train_block_pair_events = \
            model_utils.fit_community_model(train_event_dict, train_num_nodes, train_duration, num_classes,
                                            local_search_max_iter, local_search_n_cores,
                                            verbose=verbose)

        # Add nodes that were not in train to the largest block
        combined_node_membership = model_utils.assign_node_membership_for_missing_nodes(train_node_membership,
                                                                                        nodes_not_in_train)

        # Calculate log-likelihood given the entire dataset, using the parameters fitted on train
        combined_block_pair_events = utils.event_dict_to_block_pair_events(combined_event_dict,
                                                                           combined_node_membership,
                                                                           num_classes)
        combined_log_likelihood = model_utils.calc_full_log_likelihood(combined_block_pair_events,
                                                                       combined_node_membership,
                                                                       train_bp_mu, train_bp_alpha, train_bp_beta,
                                                                       combined_duration, num_classes)

        # Calculate log-likelihood given the train dataset
        train_log_likelihood = model_utils.calc_full_log_likelihood(train_block_pair_events, train_node_membership,
                                                                    train_bp_mu, train_bp_alpha, train_bp_beta,
                                                                    train_duration, num_classes)

        # Calculate per event log likelihood
        ll_per_event = model_utils.calc_per_event_log_likelihood(combined_log_likelihood, train_log_likelihood,
                                                                 test_event_dict, test_num_nodes)

        # Stop the clock here so the reported time covers fitting and evaluation, not printing or plotting.
        toc = time.time()
        lls_per_event.append(ll_per_event)

        # Print train and test log-likelihood per event
        if train_n_events is None:
            train_n_events = np.sum(utils.event_dict_to_aggregated_adjacency(train_num_nodes, train_event_dict))
        print(f"K: {num_classes} - Train ll: {train_log_likelihood / train_n_events:.4f}", end=' - ')
        print(f"Test ll: {ll_per_event:.3f} - Took: {toc - tic:.2f}s")

        if plot_fitted_hist:
            model_utils.generate_fit_community_hawkes(train_event_dict, train_node_membership,
                                                      train_bp_mu, train_bp_alpha, train_bp_beta,
                                                      train_duration,
                                                      plot_fitted_hist, n_cores=plot_n_cores)

    total_toc = time.time()
    print(f"Total time elapsed: {total_toc - total_tic:.2f}s")

    return lls_per_event
# Script entry point: example runs of the CHIP fit/evaluation on Facebook, Enron, Reality Mining and
# simulated data. Only the Facebook run is active; the other examples are kept commented out for reference.
if __name__ == "__main__":
    # Entire Facebook Dataset
    print("Facebook wall-post dataset")
    facebook_splits = dataset_utils.load_facebook_wall(
        timestamp_max=1000,
        largest_connected_component_only=True,
        train_percentage=0.8,
    )
    # The loader's result is unpacked into exactly four parts: train, test, combined, and unseen nodes.
    facebook_train, facebook_test, facebook_combined, facebook_unseen_nodes = facebook_splits
    fit_and_eval_community_hawkes(
        facebook_train,
        facebook_test,
        facebook_combined,
        facebook_unseen_nodes,
        k_values_to_test=np.arange(1, 201),
        plot_fitted_hist=False,
        verbose=False,
    )

    # Enron Dataset
    # print("Enron dataset")
    # enron_train_tuple, enron_test_tuple, enron_combined_tuple, enron_nodes_not_in_train = \
    #     dataset_utils.load_enron_train_test(remove_nodes_not_in_train=False)
    # fit_and_eval_community_hawkes(enron_train_tuple, enron_test_tuple, enron_combined_tuple, enron_nodes_not_in_train,
    #                               k_values_to_test=np.arange(1, enron_train_tuple[1]),
    #                               plot_fitted_hist=False, verbose=False)

    # Reality Mining
    # print("Reality Mining")
    # rm_train_tuple, rm_test_tuple, rm_combined_tuple, rm_nodes_not_in_train = \
    #     dataset_utils.load_reality_mining_test_train(remove_nodes_not_in_train=False)
    # fit_and_eval_community_hawkes(rm_train_tuple, rm_test_tuple, rm_combined_tuple, rm_nodes_not_in_train,
    #                               k_values_to_test=np.arange(1, rm_train_tuple[1]),
    #                               local_search_max_iter=0, local_search_n_cores=34,
    #                               plot_fitted_hist=False, verbose=False)

    # Simulated Data
    # print("Simulated Data:")
    # sim_event_dict, sim_node_membership = utils.simulate_community_hawkes(params={'number_of_nodes': 128,
    #                                                                               'end_time': 200})
    # sim_event_list = utils.event_dict_to_event_list(sim_event_dict)
    # sim_train_tuple, sim_test_tuple, sim_combined_tuple, sim_nodes_not_in_train = \
    #     dataset_utils.split_event_list_to_train_test(sim_event_list, train_percentage=0.8)
    #
    # print(sim_train_tuple[-1])
    # print(sim_test_tuple[-1])
    # print(sim_combined_tuple[-1])
    # fit_and_eval_community_hawkes(sim_train_tuple, sim_test_tuple, sim_combined_tuple, sim_nodes_not_in_train)