#!/usr/bin/python3
# TSM (Twitter Subgraph Manipulator) for Python 3
# release 9 (11/30/19)
# (c) 2014-2019 by Deen Freelon <[email protected]>
# Distributed under the BSD 3-clause license. See LICENSE.txt or http://opensource.org/licenses/BSD-3-Clause for details.
# This Python module contains a set of functions that create and manipulate Twitter and Twitter-like (directed, long-tailed, extremely sparse) network communities/subgraphs in various ways. The only Twitter-specific functions are t2e, get_top_rts, and get_top_hashtags; the rest can be used with any directed edgelist. It functions only with Python 3.x and is not backwards-compatible (although one could probably branch off a 2.x port with minimal effort).
# Warning: TSM performs no custom error-handling, so make sure your inputs are formatted properly! If you have questions, please let me know via email.
# FUNCTION LIST
# load_data: Loads data quickly
# save_csv: Quick CSV output
# t2e: Converts raw tweets into edgelist format (retweets and @-mentions, not follows)
# get_top_communities: A wrapper for Thomas Aynaud's implementation of the Louvain method for community detection. Gets the top k or (k*100)% largest communities by membership in a network and then outputs a file/variable containing the community labels and in-degrees of each user.
# calc_ei: Calculates the E-I index (a measure of insularity) of each community detected by get_top_communities
# _get_shared_ties: An extension of calc_ei that shows how "close" each community is to all of the others in terms of shared ties
# get_top_rts: Gets the most-retweeted tweets within each community
# match_communities: Compares community membership within two networks, A and B, and gives the best match found in B for each community in A
# get_intermediaries: Discovers which nodes intermediate between which communities
# get_top_hashtags: Gets the most-used hashtags in each community
# get_top_links: Gets the most-used hyperlinks or link domains in each community
# shared_ties_grid: Coaxes output of _get_shared_ties into a convenient grid format
# REQUIRED MODULES
# Below are all this module's dependencies. Everything except NetworkX and community comes standard with Python. You can get NetworkX here: http://networkx.github.io/ or through pip. You'll also need Thomas Aynaud's implementation of the Louvain method for community detection (python-louvain, which is where the community module lives), which is available here: https://bitbucket.org/taynaud/python-louvain or through pip. (Note that only the Py3-compliant version 0.4 of python-louvain will work with TSM.)
import collections
import community
import copy
import csv
import networkx as nx
import operator
import random
import re
# FUNCTIONS
# load_data: load data from a string or variable
# Arguments:
# data: If load_data is fed a string, it assumes it is a path to a CSV file and attempts to load the contents into a list of lists. If it is fed a non-string variable, it creates a deep copy.
# enc: the character encoding of the file you're trying to open. See https://docs.python.org/3.4/library/codecs.html#standard-encodings
# translate_unicode: If set to True, literal backslash Unicode escape sequences in the file (e.g. "\u2019") will be decoded into their corresponding characters before parsing. Default is False.
# Output:
# A list of lists representing the contents of a CSV file, or a deep copy of a non-string variable.
def load_data(data,enc='utf-8',translate_unicode=False):
if type(data) is str:
csv_data = []
with open(data,'r',encoding = enc,errors = 'replace') as f:
if translate_unicode == True:
reader = csv.reader((line.encode().decode('unicode_escape').replace('\0','') for line in f)) #remove NULL bytes
else:
reader = csv.reader((line.replace('\0','') for line in f)) #remove NULL bytes
for row in reader:
if row != []:
csv_data.append(row)
print('Data loaded from file "' + data + '".')
return csv_data
else:
print('Data loaded.')
return copy.deepcopy(data)
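# Example usage (a minimal sketch; 'my_edges.csv' is a hypothetical file name):
# edges = load_data('my_edges.csv') #loads a CSV file into a list of lists
# edges_copy = load_data(edges) #passing a non-string variable returns a deep copy instead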
# save_csv: save tabular data to a CSV file
# Arguments:
# filename: a string representing the filename to save to.
# data: a list of lists containing your data. If fed anything else, save_csv may behave erratically.
# use_quotes: If this flag is set to True, save_csv will add double quotes around each value before saving to disk. If set to False, delimiting quotes will not be added.
# double_to_single: If this flag and use_quotes are both set to True, all existing double quotes in the data will be converted to single quotes to avoid delimiter confusion. Default is True.
# file_mode: a string variable representing any of the standard modes for the open function. See https://docs.python.org/3.4/library/functions.html#open
# enc: the character encoding for the file you're trying to save. See https://docs.python.org/3.4/library/codecs.html#standard-encodings
# verbose: If set to True, a confirmation message will be printed after the file is saved. If set to False, this message will be suppressed. Default is True.
# Output:
# save_csv returns nothing, but should leave a text file in Python's current working directory containing the data in the data variable, assuming that directory is writeable.
def save_csv(filename,data,use_quotes=False,double_to_single=True,file_mode='w',enc='utf-8',verbose=True): #this assumes a list of lists wherein the second-level list items contain no commas
with open(filename,file_mode,encoding = enc) as out:
for line in data:
if type(line) is not list and type(line) is not tuple:
line = [line] #forces all items in 'data' to be lists
if use_quotes == True:
if double_to_single == True:
row = '"' + '","'.join([str(i).replace('"',"'") for i in line]) + '"' + "\n"
else:
row = '"' + '","'.join([str(i) for i in line]) + '"' + "\n"
else:
row = ','.join([str(i) for i in line]) + "\n"
out.write(row)
if verbose == True:
print('Data saved to file "' + filename + '".')
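# Example usage (a minimal sketch; the file name and data are hypothetical):
# save_csv('my_output.csv',[['nodeA',5],['nodeB',3]],use_quotes=True)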
# t2e: convert raw Twitter data to edgelist format
# Description: t2e takes raw Twitter data as input and outputs an edgelist consisting of the names of the tweet authors (col 1) and the names of the nodes mentioned and/or retweeted (col 2).
# Arguments:
# tweet_data: One of two things:
# 1. a string representing a path to a comma-delimited CSV file with tweet author screen names listed in col 1 and corresponding tweet text in col 2, OR
# 2. a list of lists wherein each second-degree list has a len of 2 with a tweet author screen name at index 0 and a corresponding fulltext tweet at index 1.
# extmode: see below
# enc: the character encoding of the file you're trying to open. See https://docs.python.org/3.4/library/codecs.html#standard-encodings
# save_prefix: Add a string here to save your file to CSV. Your saved file will be named as follows: 'string'_edgelist.csv
# Output: An edgelist in the form of a Python list of lists. If save_prefix is set, the edgelist will also be saved as a CSV file.
# t2e has five extraction modes (specified by the extmode variable). Default is ALL.
# ALL = do not differentiate between retweets and non-retweets, include isolates (default)
# ALL_NO_ISOLATES = do not differentiate between retweets and non-retweets, exclude isolates
# RTS_ONLY = retweets only, exclude isolates
# AT_MENTIONS_ONLY = non-retweets (@-mentions) only, exclude isolates
# REPLIES_ONLY = only tweets in which the first or second character is an "@", exclude isolates
def t2e(tweet_data,extmode='ALL',enc='utf-8',save_prefix=''):
authors = []
tweets = []
final = []
if type(tweet_data) is str:
f = open(tweet_data,'r',encoding = enc,errors = 'replace')
tweet_data = csv.reader(f)
for row in tweet_data:
if extmode.upper() == 'ALL_NO_ISOLATES':
condition = '@' in row[1]
elif extmode.upper() == 'RTS_ONLY':
condition = 'rt @' in row[1].lower()
elif extmode.upper() == 'AT_MENTIONS_ONLY':
condition = '@' in row[1] and 'rt @' not in row[1].lower()
elif extmode.upper() == 'REPLIES_ONLY':
condition = row[1][0] == '@' or row[1][1] == '@'
else:
condition = True
if condition is True:
authors.append(re.sub('[^A-Za-z0-9_]','',str(row[0]).lower().strip()))
tweets.append(' ' + row[1].lower() + ' ')
if type(tweet_data) is str:
f.close()
if extmode.upper() == 'RTS_ONLY': #only the RTed username is pulled from each RT
for i,j in enumerate(tweets):
ts = j.split('rt @')[1]
try:
rted = ts[:re.search('[^a-z0-9_]',ts).start()]
except AttributeError:
rted = ts
if len(authors[i]) >= 1 and authors[i] != rted: #prevents ppl from manually RTing themselves
final.append([authors[i],rted])
else: #the code below is necessary to pull multiple mentioned users from single tweets
tweets = [t.split('@') for t in tweets] #splits each tweet along @s
for n,chunk in enumerate(tweets):
ment_users = [t[:re.search('[^A-Za-z0-9_]',t).start()].lower().strip() for t in chunk if re.search('[^A-Za-z0-9_]',t) is not None]
final.extend([[authors[n],name] for name in ment_users if len(name.strip()) > 0])
print('Edge list created.')
if len(save_prefix) > 0:
outfile = save_prefix + '_edgelist.csv'
save_csv(outfile,final)
return final
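# Example usage (a minimal sketch; 'my_tweets.csv' is a hypothetical file with tweet authors in col 1 and tweet text in col 2):
# el = t2e('my_tweets.csv',extmode='RTS_ONLY',save_prefix='may')
# the call above returns a retweet edgelist and also writes it to may_edgelist.csv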
# get_top_communities: Get top k communities by membership
# Description: This function runs the Louvain method for community detection on an edgelist and returns the names within each of the top k detected communities, the community to which each name belongs, and each name's in-degree. It's basically a wrapper for Thomas Aynaud's excellent Python implementation of Louvain (original version here: http://perso.crans.org/aynaud/communities/) with a few upgrades I found useful. See Blondel, V. D., Guillaume, J. L., Lambiotte, R., & Lefebvre, E. (2008). Fast unfolding of communities in large networks. Journal of Statistical Mechanics: Theory and Experiment, 2008(10), P10008.
# Arguments:
# edges_data: An edgelist of the type exported by t2e. Can be a list of lists or a path to a CSV file.
# top_comm: This variable can either be an integer or a decimal (float) between 0 and 1. If an integer, it represents the top k communities by node population to be analyzed. If a decimal, it represents the top (k*100)% of communities by population to be analyzed. These will be the communities which this module's functions will manipulate. For large Twitter networks, I have found it fruitful to work with the top ten largest retweet or @-mention communities. The higher this integer or decimal, the longer TSM will take to process your data. Enter 1.0 to analyze all communities. Default is 10.
# randomize: If this variable is set to True, the edgelist will be randomized before running the rest of the function. This will produce slightly different results on each run. If the variable is set to False, the edgelist will not be randomized and the results will always be the same. Default is True.
# prominence_metric: The network metric by which nodes will be ranked in descending order in the node_list of your louvainObject. This variable may be assigned any per-node metric available in NetworkX (see http://networkx.github.io/documentation/networkx-1.9.1/ ). NetworkX's methods need to be written as strings (e.g. 'in_degree'); functions need to be written with the appropriate module prefix(es) and without quotes (e.g. tsm.nx.eigenvector_centrality). Default is 'in_degree'.
# save_prefix: Add a string here to save your file to CSV. Your saved file will be named as follows: 'string'_communities.csv
# Output: An object of the custom class 'louvainObject' with the following attributes:
# node_list: A list of lists containing each unique node in the largest communities as defined above by the top_comm variable, the ID of the community to which it belongs, and its prominence metric value (in-degree by default).
# n_nodes: A dict in which the keys are community IDs and the values are integers representing the number of nodes belonging to each community
# n_communities: An integer representing the total number of communities detected by the algorithm.
# modularity: A float representing the network's modularity.
# node_propor: A float representing the percentage of all nodes included within the largest communities.
# edge_propor: A float representing the percentage of all edges included within the largest communities.
class louvainObject:
'''an object class with attributes for various Louvain-related data and metadata'''
def __init__(self,node_list,n_nodes,n_communities,modularity,node_propor,edge_propor):
self.node_list = node_list
self.n_nodes = n_nodes
self.n_communities = n_communities
self.modularity = modularity
self.node_propor = node_propor
self.edge_propor = edge_propor
def get_top_communities(edges_data,
top_comm=10,
randomize=True,
prominence_metric='in_degree',
save_prefix=''):
edge_list = load_data(edges_data)
if randomize == True:
random.shuffle(edge_list)
non_dir = nx.Graph()
non_dir.add_edges_from(edge_list)
print("Non-directed network created.")
allmods = community.best_partition(non_dir)
print("Community partition complete.")
uniqmods = {}
for i in allmods: #creates a dict of unique communities and the n of times they occur
if allmods[i] in uniqmods:
uniqmods[allmods[i]] += 1
else:
uniqmods[allmods[i]] = 1
n_communities = len(uniqmods)
if (top_comm > 0 and top_comm < 1) or top_comm == 1.0:
top_comm = int(round(n_communities * top_comm,0))
top_n = sorted(uniqmods,key=uniqmods.get,reverse=True)[0:top_comm] #gets a list of the top k communities by node count as defined by the top_comm variable
filtered_nodes = {}
for i in allmods: #creates a dict containing only handles belonging to one of the top k communities
if allmods[i] in top_n:
filtered_nodes[i] = allmods[i]
top_edge_list = [i for i in edge_list if i[0] in filtered_nodes and i[1] in filtered_nodes]
di_net = nx.DiGraph()
di_net.add_edges_from(top_edge_list)
try:
ind = getattr(di_net,prominence_metric)()
except (AttributeError,TypeError):
ind = prominence_metric(di_net)
outlist = []
for i in filtered_nodes:
outlist.append([ind[i],i,str(filtered_nodes[i]),str(ind[i])])
outlist.sort(reverse=True)
for i in outlist:
del i[0]
mod = round(community.modularity(allmods,non_dir),2)
node_propor = round((len(filtered_nodes)/len(allmods))*100,2)
edge_propor = round((len(top_edge_list)/len(edge_list))*100,2)
n_nodes = {}
for i in outlist:
if i[1] in n_nodes:
n_nodes[i[1]] += 1
else:
n_nodes[i[1]] = 1
print("Total n of communities:",n_communities)
if n_communities < top_comm:
top_comm = n_communities #in case there are fewer detected communities than specified in top_comm
print("Modularity:",mod)
print("Community analysis complete. The top",top_comm,"communities in this network account for",node_propor,"% of all nodes.")
print("And",edge_propor,"% of all edges.")
if len(save_prefix)>0:
if type(prominence_metric) is str:
prominence_header = prominence_metric
else:
prominence_header = prominence_metric.__name__
outlist.insert(0,['name','community',prominence_header])
outfile = save_prefix + '_communities.csv'
save_csv(outfile,outlist)
return louvainObject(outlist,n_nodes,n_communities,mod,node_propor,edge_propor)
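# Example usage (a minimal sketch; 'el' is assumed to be an edgelist created by t2e as above):
# louv = get_top_communities(el,top_comm=10)
# print(louv.modularity,louv.n_communities)
# louv.node_list holds [name,community ID,in-degree] rows for nodes in the top communities (in-degree is the default prominence metric)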
# calc_ei: Calculate EI index for top k communities
# Description: This function calculates Krackhardt & Stern's EI index for each community represented in a file or variable output by get_top_communities. The EI index ranges between 1 and -1, with 1 indicating that all the community's ties are with outsiders, -1 indicating they are all with members, and 0 indicating equal numbers of ties with members and outsiders. See Krackhardt, D., & Stern, R. N. (1988). Informal networks and organizational crises: An experimental simulation. Social psychology quarterly, 123-140.
# Arguments:
# nodes_data: A community-partition dataset of the type exported by get_top_communities. Can be a variable (a list of lists) or a path to a CSV file.
# edges_data: An edgelist of the type exported by t2e. Can be a variable (a list of lists) or a path to a CSV file.
# all_output: If set to True, the function _get_shared_ties will execute after calc_ei has finished. If set to False, _get_shared_ties will not execute. Default is True.
# pause: If set to True, the function will pause and wait for a key strike before displaying the results of _get_shared_ties. If set to False, it will not pause. Default is False.
# weight_edges: If set to True, the function will include duplicate edges in the EI calculations. If set to False, the edgelist will be unweighted--in other words all duplicate edges will be removed. For example, if the userA->userB edge has a weight of 5 (meaning A linked to B five distinct times), the function will count that as a single unweighted tie. Default is True.
# verbose: If set to True, calc_ei will print some of its output to the shell prompt. If set to False, this output will be suppressed. Default is False.
# save_prefix: Add a string here to save your file to CSV. Your saved file will be named as follows: 'string'_ei_indices.csv
# Output: An object of the custom class "eiObject" containing the following attributes:
# n_nodes: An OrderedDict in which the keys are community IDs and the values are integers representing the number of nodes belonging to each community
# ei_indices: An OrderedDict in which the keys are community IDs and the values are corresponding EI indices.
# internal_ties: An OrderedDict in which the keys are community IDs and the values are counts of internal edges.
# external_ties: An OrderedDict in which the keys are community IDs and the values are counts of external edges.
# mean_ei: The mean of the EI indices from ei_indices.
# If the all_output flag is set to True, the returned eiObject will also include the following optional attributes:
# total_ties: An OrderedDict in which the keys are community IDs and the values are total counts of all ties involving at least one community member.
# received_ties: An OrderedDict in which each key is a community ID and each value is a count of all ties wherein the recipient is a community member.
# sent_ties: An OrderedDict in which each key is a community ID and each value is a count of all ties wherein the sender is a community member.
# r_s: An OrderedDict in which each key is a community ID and each value is the received count minus the sent count.
# adj_in: An OrderedDict of dicts in which each key is a community ID (A), each second-level key is a community ID (B), and each second-level value is the number of edges originating in B and pointing to A.
# adj_out: An OrderedDict of dicts in which each key is a community ID (A), each second-level key is a community ID (B), and each second-level value is the number of edges originating in A and pointing to B.
class eiObject:
'''an object class with attributes for various EI-index-related data and metadata'''
def __init__(self,n_nodes=None,ei_indices=None,internal_ties=None,external_ties=None,mean_ei=None,total_ties=None,received_ties=None,sent_ties=None,r_s=None,adj_in=None,adj_out=None):
self.n_nodes = n_nodes
self.ei_indices = ei_indices
self.internal_ties = internal_ties
self.external_ties = external_ties
self.mean_ei = mean_ei
self.total_ties = total_ties
self.received_ties = received_ties
self.sent_ties = sent_ties
self.r_s = r_s
self.adj_out = adj_out
self.adj_in = adj_in
def calc_ei(nodes_data,edges_data,all_output=True,pause=False,weight_edges=True,verbose=False,save_prefix=''):
if weight_edges == False:
print("Calculating EI indices using *UNweighted* edges.\n")
else:
print("Calculating EI indices using *weighted* edges.\n")
nodes = load_data(nodes_data)
if nodes[0][0] == 'name':
del nodes[0] #remove headers from CSV
edges = load_data(edges_data)
modclass = [i[1] for i in nodes]
moduniq = {}
for i in modclass: #get and count unique community IDs
if i in moduniq:
moduniq[i] += 1
else:
moduniq[i] = 1
mu_top = sorted(moduniq,key=moduniq.get,reverse=True) #get all communities from node file
top_nodes_list = [i for i in nodes if i[1] in mu_top]
top_nodes = {node[0]:node[1] for node in top_nodes_list} #create dict of screen names and community IDs
if weight_edges == False:
edges = list(set([i[0] + "," + i[1] for i in edges])) #unweight the edgelist--multiple links to B from A count as one edge
edges = [i.split(",") for i in edges]
for edge in edges:
if edge[0] in top_nodes:
edge.append(top_nodes[edge[0]])
for edge in edges:
if edge[1] in top_nodes:
edge.append(top_nodes[edge[1]])
top_edges = [i for i in edges if len(i) == 4]
ei_int = {}
ei_ext = {}
for i in mu_top:
ei_int[i] = 0
ei_ext[i] = 0
for j in top_edges:
if j[2] == i and j[3] == i:
ei_int[i] += 1
elif j[2] == i or j[3] == i:
ei_ext[i] += 1
ei_indices = {}
for i in ei_int:
ei_indices[i] = round((ei_ext[i]-ei_int[i])/(ei_ext[i]+ei_int[i]),3)
ei_ord = collections.OrderedDict(sorted(ei_indices.items()))
ei_int = collections.OrderedDict(sorted(ei_int.items()))
ei_ext = collections.OrderedDict(sorted(ei_ext.items()))
mean_ei = round(sum(ei_indices.values())/len(ei_indices.values()),3)
if verbose == True:
print("***EI indices***\n")
print("Community\tEI index")
for i in ei_ord:
print(str(i)+"\t"+str(ei_ord[i]))
print("Mean EI:\t",mean_ei)
n_nodes = {}
for i in nodes:
if i[1] in n_nodes:
n_nodes[i[1]] += 1
else:
n_nodes[i[1]] = 1
if pause == True:
input('Press any key to continue...')
if all_output == True:
ei_out = _get_shared_ties(mu_top,top_edges,ei_int,ei_ext,n_nodes,verbose)
else:
ei_out = eiObject()
if len(save_prefix) > 0:
ei_list = []
for i in ei_ord:
ei_list.append([str(i),str(ei_ord[i])])
ei_list.insert(0,['Community IDs','EI indices'])
save_csv(save_prefix + '_ei_indices.csv',ei_list)
print("\n")
ei_out.n_nodes = collections.OrderedDict(sorted(n_nodes.items()))
ei_out.ei_indices = ei_ord
ei_out.internal_ties = ei_int
ei_out.external_ties = ei_ext
ei_out.mean_ei = mean_ei
return ei_out
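# Example usage (a minimal sketch; 'louv' and 'el' are assumed to come from get_top_communities and t2e as above):
# ei = calc_ei(louv.node_list,el,all_output=True)
# print(ei.mean_ei) #mean EI index across the top communities
# ei.ei_indices maps each community ID to its EI index; ei.adj_in and ei.adj_out hold the shared-tie counts used by shared_ties_grid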
# _get_shared_ties: Obtains numbers of shared ties between each community and all others
# Description: This function reveals how a given community's "external" edges are distributed among the other communities. It is not a standalone function: it is executed automatically by calc_ei when that function's all_output argument is set to True. So don't try to enter the following arguments into the function yourself unless you know what you're doing.
# Arguments:
# top_community_ids: A list of the top k communities by membership.
# top_edges: A list of all edges both of whose nodes belong to one of the top k communities.
# ei_int: A dict in which each key is one of the top k community IDs and each value is the number of edges in which both nodes are members of that community.
# ei_ext: A dict in which each key is one of the top k community IDs and each value is the number of edges in which one node is a member of that community and the other is a member of any other community.
# n_nodes: A dict in which each key is one of the top k community IDs and each value is the total number of nodes in that community.
# verbose: If set to True, _get_shared_ties will print some of its output to the shell prompt. If set to False, this output will be suppressed. The value of this variable is inherited from calc_ei, where its default value is False.
# Output: The optional output attributes for the "eiObject" class (see above).
def _get_shared_ties(top_community_ids,top_edges,ei_int,ei_ext,n_nodes,verbose):
adj_out = {} #sent ties point away from the focal community
adj_in = {} #received ties point toward the focal community
for i in top_community_ids:
adj_out[i] = {}
adj_in[i] = {}
for j in top_edges:
if j[2] == i and j[3] != i:
if j[3] in adj_out[i]:
adj_out[i][j[3]] += 1
else:
adj_out[i][j[3]] = 1
if j[2] != i and j[3] == i:
if j[2] in adj_in[i]:
adj_in[i][j[2]] += 1
else:
adj_in[i][j[2]] = 1
total_dict = {}
received_dict = {}
sent_dict = {}
r_s_dict = {}
for i in top_community_ids:
total = ei_int[i]+ei_ext[i]
total_dict[i] = total
incoming = sum(adj_in[i].values())
received_dict[i] = incoming
outgoing = sum(adj_out[i].values())
sent_dict[i] = outgoing
r_s = round(incoming - outgoing,3)
r_s_dict[i] = r_s
if verbose == True:
print("\nCommunity\tSize\tInternal\tReceived\tSent\tReceived - Sent")
print(str(i)+"\t"+str(n_nodes[i])+"\t"+str(round(ei_int[i]/total,3))+"\t"+str(round(incoming/total,3))+"\t"+str(round(outgoing/total,3))+"\t"+str(round(r_s/total,3))+"\n")
for j in top_community_ids:
if i != j:
if j in adj_out[i]:
print(str(j)+"-s\t"+str(round(adj_out[i][j]/total,3)))
if j in adj_in[i]:
print(str(j)+"-r\t"+str(round(adj_in[i][j]/total,3)))
prox_out = eiObject()
prox_out.adj_in = collections.OrderedDict(sorted(adj_in.items()))
prox_out.adj_out = collections.OrderedDict(sorted(adj_out.items()))
prox_out.total_ties = collections.OrderedDict(sorted(total_dict.items()))
prox_out.received_ties = collections.OrderedDict(sorted(received_dict.items()))
prox_out.sent_ties = collections.OrderedDict(sorted(sent_dict.items()))
prox_out.r_s = collections.OrderedDict(sorted(r_s_dict.items()))
return prox_out
# get_top_rts: Gets the most-retweeted tweets in a Twitter dataset with community IDs
# Description: This function returns a list of the most-retweeted tweets along with the community IDs of the tweet authors and retweet counts. This allows researchers to easily view the most-retweeted tweets within each community.
# Arguments:
# tweets_file: A CSV file containing tweets formatted for t2e as specified above.
# nodes_data: A community-partition dataset of the type exported by get_top_communities. Can be a variable (a list of lists) or a path to a CSV file. If left blank, the community column in the output will be left empty.
# tweet_index: An integer indicating the index of the column that contains the tweet text. Default is 1 (the second column).
# min_rts: An integer indicating the minimum number of retweets to be included in the output. Default is 5. Increasing this number will reduce your filesize and processing time; decreasing it will do the opposite.
# lc: A boolean value determining whether the retweets will be converted to lowercase before counting duplicates. Lowercasing retweets may increase retweet counts but it will break case-sensitive hyperlinks such as those generated by Twitter. Default is False.
# enc: the character encoding of the file you're trying to open and/or save. See https://docs.python.org/3.4/library/codecs.html#standard-encodings
# save_prefix: Add a string here to save your file to CSV. Your saved file will be named as follows: 'string'_top_RTs.csv
# Output: A list of lists, each of which contains the name of the retweeted user, the full text of the retweet, the user's community ID, and the number of times the tweet was retweeted. This list is ranked in descending order of retweet count.
def get_top_rts(tweets_file,nodes_data='',tweet_index=1,min_rts=5,lc=False,enc='utf-8',save_prefix=''):
rts = []
if nodes_data != '':
node_dict = {i[0]:i[1] for i in load_data(nodes_data)}
else:
node_dict = {i[0].lower():'' for i in load_data(tweets_file)}
with open(tweets_file,'r',encoding=enc,errors='replace') as f:
reader = csv.reader(f)
for row in reader:
if row[tweet_index].startswith('RT @') and row[tweet_index].find(':')>-1:
if lc == False:
rts.append(row[tweet_index])
else:
rts.append(row[tweet_index].lower())
rts_ct = collections.Counter(rts).most_common()
rts_ct_out = []
for n,i in enumerate(rts_ct):
rted = i[0][i[0].find('@')+1:i[0].find(':')].lower()
if nodes_data == '' and rted not in node_dict:
node_dict[rted] = ''
try:
rts_ct_out.append([rted,rts_ct[n][0],node_dict[rted],rts_ct[n][1]])
except KeyError:
pass
rts_ct_out = [i for i in rts_ct_out if i[3] >= min_rts]
if len(save_prefix) > 0:
rts_ct_out.insert(0,['rted_user','rt_text','community','n_rts'])
out_fn = save_prefix + '_top_RTs.csv'
save_csv(out_fn,rts_ct_out,True)
return rts_ct_out[1:]
else:
return rts_ct_out
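# Example usage (a minimal sketch; 'my_tweets.csv' is a hypothetical file name and 'louv' is assumed to come from get_top_communities as above):
# top_rts = get_top_rts('my_tweets.csv',nodes_data=louv.node_list,min_rts=10)
# each row contains [rted_user,rt_text,community,n_rts]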
# match_communities: Find the best community matches between two networks using the weighted Jaccard coefficient
# Description: This function takes two partitioned networks, A and B, and finds the best match for each community in A among the communities in B. Matches are determined by measuring membership overlap with either the weighted or the unweighted Jaccard coefficient, depending on how the weight_edges parameter is set. To reduce processing time, only the top (nodes_filter * 100)% of nodes by in-degree in each community are compared. The weighted Jaccard comparisons are weighted by in-degree, meaning that higher in-degree nodes count more toward community similarity. This is based on the assumption that nodes of higher in-degree play a proportionately larger role in terms of maintaining community coherence.
# Arguments:
# nodes_data_A: A community-partition dataset of the type exported by get_top_communities (network A). Can be a variable or a path to a CSV file.
# nodes_data_B: A community-partition dataset of the type exported by get_top_communities (network B). Can be a variable or a path to a CSV file.
# nodes_filter: A float variable greater than 0 and less than 1 representing the per-community proportion of top in-degree nodes to compare between A and B. Default is 0.01 (1%). Increasing this number will increase processing time.
# jacc_threshold: A float variable greater than 0 and less than 1 representing the Jaccard value above which community pairs will be considered valid matches. Default is 0.3. I would caution against using this blindly--try a few different values and see what seems to make sense for your data.
# dc_threshold: A float variable representing the Jaccard value above which convergences and divergences (as defined below) will be considered valid. I suggest setting this somewhat lower than jacc_threshold. Default is 0.2.
# weight_edges: If set to True, the function will weight the Jaccard coefficient by each node's in-degree to match communities between networks A and B. If set to False, it will use the unweighted Jaccard coefficient. Default is True.
# verbose: If set to True, match_communities will print some of its output to the shell prompt. If set to False, this output will be suppressed. Default value is False.
# Output: An object of the custom class "cMatchObject" containing the following attributes:
# best_matches: An OrderedDict in which the keys are the best community matches for network A in network B (in AxB format where A and B are community IDs) and the values are the corresponding Jaccard values.
# shared_nodes: An OrderedDict in which the keys are the best matches for network A in network B (in AxB format where A and B are community IDs) and the values comprise the set of top nodes shared between the two networks.
# nonzero_jaccs: An OrderedDict of dicts in which each first-order key is a community ID from network A, each second-order key is a community ID from network B, and each second-order value is the nonzero Jaccard between the A and B IDs.
# divergences: An OrderedDict in which each key is a community ID from network A and each value is a list containing two or more IDs from network B. These IDs represent divergences, i.e. situations in which two or more month-B communities share Jaccard values exceeding dc_threshold with a single month-A community.
# convergences: An OrderedDict in which each key is a community ID from network B and each value is a list containing two or more IDs from network A. These IDs represent convergences--the logical converses of divergences--i.e. situations in which two or more month-A communities share Jaccard values exceeding dc_threshold with a single month-B community.
class cMatchObject:
'''an object class with attributes for various matched-community data and metadata'''
def __init__(self,best_matches=None,shared_nodes=None,nonzero_jaccs=None,divergences=None,convergences=None):
self.best_matches = best_matches
self.shared_nodes = shared_nodes
self.nonzero_jaccs = nonzero_jaccs
self.divergences = divergences
self.convergences = convergences
def match_communities(nodes_data_A,nodes_data_B,nodes_filter=0.01,jacc_threshold=0.3,dc_threshold=0.2,weight_edges=True,verbose=False):
filtered_nodes_1 = {}
filtered_nodes_2 = {}
# If data already filtered, assign the vars
if (type(nodes_data_A) is dict) and (type(nodes_data_B) is dict):
filtered_nodes_1 = nodes_data_A
filtered_nodes_2 = nodes_data_B
# Else assume it is a CSV file and load, then assign via _filter_nodes()
else:
nodesA = load_data(nodes_data_A)
if nodesA[0][0] == 'name':
del nodesA[0]
nodesB = load_data(nodes_data_B)
if nodesB[0][0] == 'name':
del nodesB[0]
filtered_nodes_1 = _filter_nodes(nodesA,nodes_filter)
filtered_nodes_2 = _filter_nodes(nodesB,nodes_filter)
hijacc = 0
best_match = {}
nonzero_jacc = {}
for i in filtered_nodes_1:
nonzero_jacc[i] = {}
hix = i+'x'
for j in filtered_nodes_2:
intersect = set(filtered_nodes_1[i]).intersection(set(filtered_nodes_2[j])) #get intersection of names for month 1 + 2
union_both = set(filtered_nodes_1[i]).union(set(filtered_nodes_2[j]))
if weight_edges == True:
inter_weights = [int(k[2]) for k in nodesA if k[0] in intersect] + [int(k[2]) for k in nodesB if k[0] in intersect] #pull intersection in-degrees from both months and combine into a single list
union_weights = [int(k[2]) for k in nodesA if k[0] in union_both] + [int(k[2]) for k in nodesB if k[0] in union_both] #pull union in-degrees from both months and combine into a single list
try:
jacc = sum(inter_weights)/sum(union_weights)
except ZeroDivisionError:
jacc = 0
else: #if weight_edges is False, set all weights to 1 for each node
inter_weights = len(intersect)
union_weights = len(union_both)
try:
jacc = inter_weights/union_weights
except ZeroDivisionError:
jacc = 0
if jacc > 0:
if verbose == True:
print(i+'x'+j+"\t"+str(round(jacc,4)))
nonzero_jacc[i][j] = round(jacc,4) #
if jacc > hijacc:
hijacc = round(jacc,4)
hix = i+'x'+j
if verbose == True:
print('high:'+"\t"+hix+"\t"+str(hijacc)+"\n")
best_match[hix] = hijacc
hijacc = 0
shared_node = {}
if verbose == True:
print('Top community matches:')
for i in best_match:
if best_match[i] >= jacc_threshold:
nodes_A = set(filtered_nodes_1[i[:i.find('x')]])
nodes_B = set(filtered_nodes_2[i[i.find('x')+1:]])
shared_node[i] = nodes_A.intersection(nodes_B)
sn = ', '.join(shared_node[i])
if verbose == True:
print(i,"\t",best_match[i],"\t",sn,"\n")
diverge = {}
for i in nonzero_jacc: #divergence test
if len(nonzero_jacc[i]) > 0:
jacc_tmp = sorted(nonzero_jacc[i].values(),reverse=True)
highest_key = sorted(nonzero_jacc[i],key=nonzero_jacc[i].get,reverse=True)[0]
for j in nonzero_jacc[i]:
if highest_key != j and jacc_tmp[0] >= dc_threshold and nonzero_jacc[i][j] >= dc_threshold:
if verbose == True:
print('Divergence of month-A community',i,'into month-B communities',highest_key,'and',j)
diverge[i] = [highest_key,j]
thres_jacc = {} #convergence test
for i in nonzero_jacc: #get all jaccs above dc_threshold
thres_jacc[i] = {}
for j in nonzero_jacc[i]:
if nonzero_jacc[i][j] >= dc_threshold:
thres_jacc[i][j] = nonzero_jacc[i][j]
thres_dict_list = {} #get a dict of lists of the month-B IDs above dc_threshold
for i in thres_jacc:
thres_dict_list[i] = list(thres_jacc[i].keys())
conv = {} #create a dict of empty lists corresponding to all unique month-B IDs
for i in thres_dict_list:
for j in thres_dict_list[i]:
conv[j] = []
for i in thres_dict_list: #figure out which month-B IDs appear as multiple matches for month-A IDs
for j in thres_dict_list:
if i != j:
c_intersect = set(thres_dict_list[i]).intersection(set(thres_dict_list[j]))
if len(c_intersect) > 0:
conv_msg = 0
for k in c_intersect:
if i not in conv[k]:
conv_msg += 1
conv[k].append(i)
if j not in conv[k]:
conv_msg += 1
conv[k].append(j)
if conv_msg == 2:
if verbose == True:
print('Convergence of month-A communities',i,'and',j,'into month-B community',k)
converge = {}
for i in conv:
if len(conv[i]) > 0:
converge[i] = conv[i]
match_out = cMatchObject()
match_out.best_matches = collections.OrderedDict(sorted(best_match.items()))
match_out.shared_nodes = collections.OrderedDict(sorted(shared_node.items()))
match_out.nonzero_jaccs = collections.OrderedDict(sorted(nonzero_jacc.items()))
match_out.divergences = collections.OrderedDict(sorted(diverge.items()))
match_out.convergences = collections.OrderedDict(sorted(converge.items()))
return match_out
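# Example usage (a minimal sketch; louvA.node_list and louvB.node_list are assumed to be get_top_communities outputs for two time periods):
# matches = match_communities(louvA.node_list,louvB.node_list,jacc_threshold=0.3)
# matches.best_matches maps 'AxB' community-ID pairs to their Jaccard values
# matches.divergences and matches.convergences list splits and merges between the two networks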
# _filter_nodes: Get the nodes of highest in-degree in a network OR the nodes in a fixed list that appear in a network
# Description: This is a helper function for match_communities and get_intermediaries that simply loads the top (nodes_filter * 100)% of nodes by in-degree OR a preset list of nodes in each community in a partitioned network into a list.
# Arguments:
# nodes_data: A community-partition dataset of the type exported by get_top_communities.
# nodes_filter: This variable can either be a float greater than 0 and less than 1 OR a list of node names. If a float, the variable represents the proportion of top in-degree nodes to extract from each community. If a list of nodes, it represents the specific set of nodes to extract from nodes_data when present. Default is 0.01 (1%). Increasing the float will increase processing time.
#Output: A dict whose keys are community IDs and whose values are filtered lists of node names.
def _filter_nodes(nodes_data,nodes_filter=0.01):
uniq_cl = set([i[1] for i in nodes_data]) #creates a set of the unique community IDs present in the node data
filtered_nodes = {} #creates a dict of lists. Each list contains the top (nodes_filter*100)% most-connected nodes within each community
if type(nodes_filter) is float:
for i in uniq_cl:
cltemp = [j[0] for j in nodes_data if j[1] == i]
pct = int(len(cltemp) * nodes_filter)
filtered_nodes[i] = [j for j in cltemp][0:pct]
else:
nf2 = [[j[0],j[1]] for j in nodes_data if j[0] in nodes_filter]
for i in uniq_cl:
filtered_nodes[i] = [j[0] for j in nf2 if j[1] == i]
return filtered_nodes
# get_intermediaries: Identifies nodes that are heavily linked to by multiple network communities
# Description: When analyzing partitioned networks, it is sometimes helpful to know not only which nodes are high in betweenness centrality, but also which communities are bridged by such nodes. This function identifies high in-degree nodes whose ties are relatively evenly distributed across at least two communities.
# Arguments:
# nodes_data: A community-partition dataset of the type exported by get_top_communities.
# edges_data: An edgelist of the type exported by t2e.
# bridge_threshold: A float variable greater than 0 and less than 1 representing the minimum proportion of internal ties a given top node needs to receive from an external community to count as a bridge. For example, for node DF where the community most connected to DF is A, setting bridge_threshold to 0.5 means that for DF to count as a bridge, the number of ties DF receives from second most-connected community B must equal at least 50% of the ties it receives from A. Default is 0.5.
# nodes_filter: This variable can be either a float greater than 0 and less than 1 or a list of node names. If the former, it represents the proportion of top in-degree nodes to extract from each community. If the latter, it represents the collection of nodes to search for in each community. Default is 0.01 (1%). Increasing the float or list size will increase processing time.
# verbose: If set to True, the shell will print a message every time a new node is added to the bridge list. Default is False.
# zeropad: If set to True, every top community will appear in each bridge node's dict of received ties, with a value of 0 for communities from which the node received no edges. If set to False, communities from which the node received no edges will simply be omitted from that dict. Default is True.
# Output: A list of lists, each of which contains a bridge node's in-degree (at index 0), its name (at index 1), and a dict in which each key is a community ID and each value is the N of ties the node received from that community (at index 2). Note: the community ID of the bridge node is not explicitly indicated in this dict, but it is almost always the ID with the highest N of received ties.
def get_intermediaries(nodes_data,edges_data,bridge_threshold=0.5,nodes_filter=0.01,verbose=False,zeropad=True):
nodes = load_data(nodes_data)
if nodes[0][0] == 'name':
del nodes[0]
edges = load_data(edges_data)
filtered_nodes = _filter_nodes(nodes,nodes_filter)
total_nodes = sum([len(filtered_nodes[i]) for i in filtered_nodes])
name_ct = 0
cmty_list = list(filtered_nodes.keys())
node_dict = {i[0]:i[1] for i in nodes} #create dict of node names and community IDs
bridge_cands = {}
for n,cmty in enumerate(filtered_nodes):
for name in filtered_nodes[cmty]:
if verbose == True:
name_ct += 1
print('Analyzing node "' + name + '" (' + str(name_ct) + ' of ' + str(total_nodes) + ' total).')
user_edges = []
for i in edges: #pull all edges of which node is the recipient
if name == i[1]:
user_edges.append(i)
ue_minus = [i for i in user_edges if i[0] in node_dict] #remove all nodes not in the top k communities
cmty_rts = {}
for i in cmty_list:
for j in ue_minus:
if node_dict[j[0]] == i:
if i in cmty_rts:
cmty_rts[i] += 1
else:
cmty_rts[i] = 1
list_rts_ct = sorted(list(cmty_rts.values()),reverse=True)
if bridge_threshold > 0:
add_bool = len(cmty_rts) >= 2 and list_rts_ct[1] >= list_rts_ct[0]*bridge_threshold #the N of ties to the 2nd-highest community must equal or exceed a minimum proportion of the N of ties to the highest community
elif len(list_rts_ct) > 0:
add_bool = True
else:
add_bool = False
if add_bool is True:
if verbose == True:
print('Node "' + name + '" added to the list.')
cmty_rts = collections.OrderedDict(sorted(cmty_rts.items(),key=operator.itemgetter(1),reverse=True))
bridge_cands[name] = cmty_rts
bridge_list = []
for i in bridge_cands:
bridge_list.append([sum(bridge_cands[i].values()),i,bridge_cands[i]])
bridge_list = sorted(bridge_list,reverse=True)
if zeropad == True:
for i in bridge_list:
if len(i[2]) < len(cmty_list):
omitted = [j for j in cmty_list if j not in i[2]]
for k in omitted:
i[2][k] = 0
return bridge_list
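# Example usage (a minimal sketch; 'louv' and 'el' are assumed to come from get_top_communities and t2e as above):
# bridges = get_intermediaries(louv.node_list,el,bridge_threshold=0.5)
# each row contains a bridge node's in-degree, its name, and a dict mapping community IDs to the N of ties the node received from each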
# get_top_hashtags: Collects the most-used hashtags in each community in descending order of popularity
# Description: This function collects the most-used hashtags in a set of tweets that's been partitioned into communities and organizes them first by community and then in descending order of popularity.
# Arguments:
# tweets_data:
# IF nodes_data IS NONBLANK: a path to a CSV file (the only delimiter currently allowed is commas) with tweet authors listed in col 1 and corresponding tweet text in col 2, or an equivalent k x 2 list of lists. If col 1 contains any text, col 2 must as well, and vice versa.
# IF nodes_data IS BLANK: a path to a CSV file (the only delimiter currently allowed is commas) with tweet text in col 1, which should be the sole column.
# nodes_data: A community-partition dataset of the type exported by get_top_communities. Can be left blank per the explanation above.
# min_ct: The minimum number of times a hashtag must appear in a given community to be included in that community's list. Increasing this number speeds processing. Default is 10.
# Output:
# IF nodes_data IS NONBLANK: A dict whose keys are community IDs and whose values are lists, the values of which are tuples in which the first value is a hashtag and the second is the number of times it appears within the given community. Each list is arranged in descending order of hashtag prevalence.
# IF nodes_data IS BLANK: a list whose values are tuples in which the first value is a hashtag and the second is the number of times it appears within the given community. This list is arranged in descending order of hashtag prevalence.
def get_top_hashtags(tweets_data,nodes_data='',min_ct=10):
tweets = [i for i in load_data(tweets_data) if len(i) > 0 and i[0].strip() != '']
if nodes_data != '':
nodes = [i for i in load_data(nodes_data) if i[0].strip() != '']
if nodes[0][0] == 'name':
del nodes[0]
clust_uniq = set([i[1] for i in nodes])
node_dict = {i[0].lower():i[1] for i in nodes}
tweets = tuple([[i[0].lower(),i[1].lower()] for i in tweets if i[0].lower() in node_dict])
else:
if type(tweets_data[0]) is list and len(tweets_data[0]) >= 2:
tweets = [i[1] for i in tweets if len(i[1]) > 0]
elif type(tweets_data[0]) is list:
tweets = [i[0] for i in tweets if len(i[0]) > 0]
else:
tweets = tweets_data
clust_uniq = set('1')
tweets = tuple([i.lower() for i in tweets])
ht_dict = {}
for cid in clust_uniq:
if nodes_data != '':
splitprep = [t[1].replace(u'\u200F','') for t in tweets if '#' in t[1] and node_dict[t[0]] == cid] #keeps only lowercased tweets from the current community that contain a hashmark, with right-to-left marks removed
else:
splitprep = tuple([t.replace(u'\u200F','') for t in tweets if '#' in t])
final = []
hts = [re.findall('#\w+',t) for t in splitprep]
for h in hts:
final.extend(set(h))
ht_dict[cid] = tuple(final)
for i in ht_dict:
ht_dict[i] = tuple([j for j in collections.Counter(ht_dict[i]).most_common() if j[1] >= min_ct])
if nodes_data != '':
return {i:ht_dict[i] for i in ht_dict if len(ht_dict[i]) > 0}
else:
return [i for i in ht_dict['1'] if len(ht_dict['1']) > 0 and i[1] >= min_ct]
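# Example usage (a minimal sketch; 'my_tweets.csv' is a hypothetical file name and 'louv' is assumed to come from get_top_communities as above):
# top_tags = get_top_hashtags('my_tweets.csv',nodes_data=louv.node_list,min_ct=10)
# top_tags maps each community ID to a tuple of (hashtag,count) pairs in descending order of count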
# get_top_links: Collects the most-used hyperlinks or web domains in each community in descending order of popularity
# Description: This function collects the most-used hyperlinks or web domains in a set of tweets that's been partitioned into communities and organizes them first by community and then in descending order of popularity.
# Arguments:
# tweets_data:
# IF nodes_data IS NONBLANK: a path to a CSV file (the only delimiter currently allowed is commas) with tweet authors listed in col 1 and corresponding tweet text in col 2, or an equivalent k x 2 list of lists. If col 1 contains any text, col 2 must as well, and vice versa.
# IF nodes_data IS BLANK: a path to a CSV file (the only delimiter currently allowed is commas) with tweet text in col 1, which should be the sole column.
# nodes_data: A community-partition dataset of the type exported by get_top_communities. Can be left blank per the explanation above.
# min_ct: The minimum number of times a hyperlink or domain must appear in a given community to be included in that community's list. Increasing this number speeds processing. Default is 10.
# domains_only: If set to True, get_top_links will extract only web domains (e.g. all articles from the New York Times will be counted under the nytimes.com domain). If set to False, it will extract full links and count distinct links with the same domain separately. Default is False.
# remove_3ld: If set to True, the function will remove all third-level domains from the links (e.g. "www."). If set to False, it will leave all third-level domains intact. Default is False.
# remove_trailing_chars: A list of characters at which each extracted link will be truncated if present (e.g. '#' and '?' strip URL fragments and query strings). Default is ['#','?'].
# exclude_domains: If a list of web domains is entered, links containing these domains will be removed from the data prior to counting. Default is an empty list, which leaves all domains intact. Entering anything other than a list of strings may result in an error.
# Output:
# IF nodes_data IS NONBLANK: A dict whose keys are community IDs and whose values are lists, the values of which are tuples in which the first value is a link or domain and the second is the number of times it appears within the given community. Each list is arranged in descending order of link/domain prevalence.
# IF nodes_data IS BLANK: a list whose values are tuples in which the first value is a link or domain and the second is the number of times it appears within the given community. This list is arranged in descending order of link/domain prevalence.
def get_top_links(tweets_data,nodes_data='',min_ct=10,domains_only=False,remove_3ld=False,remove_trailing_chars=['#','?'],exclude_domains=[]):
tweets = [i for i in load_data(tweets_data) if len(i) > 0 and i[0].strip() != '']
if nodes_data != '':
nodes = [i for i in load_data(nodes_data) if i[0].strip() != '']
if nodes[0][0] == 'name':
del nodes[0]
clust_uniq = set([i[1] for i in nodes])
node_dict = {i[0].lower():i[1] for i in nodes}
tweets = tuple([[i[0].lower(),i[1].replace('https','http')] for i in tweets if i[0].lower() in node_dict])
else:
if type(tweets_data[0]) is list and len(tweets_data[0]) >= 2:
tweets = [i[1] for i in tweets if len(i[1]) > 0]
elif type(tweets_data[0]) is list:
tweets = [i[0] for i in tweets if len(i[0]) > 0]
else:
tweets = tweets_data
clust_uniq = set('1')
tweets = tuple([i.replace('https','http') for i in tweets])
links_dict = {}
domains_only_regex = r'(?:http)(?:s?)(?:://)(.+?)(?:/|\s|$)'
standard_regex = r'(?:http)(?:s?)(?:://)(.+?)(?:\s|$)'
for cid in clust_uniq:
if nodes_data != '':
splitprep = [t[1].replace(u'\u200F','') for t in tweets if 'http://' in t[1] and node_dict[t[0]] == cid] #keeps only tweets from the current community that contain a hyperlink, with right-to-left marks removed
else:
splitprep = [t.replace(u'\u200F','') for t in tweets if 'http://' in t and '.' in t]
final = []
if domains_only == True:
links = [re.findall(domains_only_regex,t) for t in splitprep]
else:
links = [re.findall(standard_regex,t) for t in splitprep]
for u in links:
final.extend(set(u))
if len(exclude_domains) > 0:
final = [hl for hl in final if not any(d in hl for d in exclude_domains)]
if len(remove_trailing_chars) > 0:
for n,hl in enumerate(final):
if any(t in hl for t in remove_trailing_chars):
t0 = sorted([hl.find(c) for c in remove_trailing_chars if c in hl])[0]
final[n] = hl[:t0]
if remove_3ld == True:
for n,hl in enumerate(final):
if hl.count('.') >= 2:
final[n] = hl[hl.find('.')+1:]
links_dict[cid] = tuple(final)
for i in links_dict:
links_dict[i] = tuple([j for j in collections.Counter(links_dict[i]).most_common() if j[1] >= min_ct])
if nodes_data != '':
return {i:links_dict[i] for i in links_dict if len(links_dict[i]) > 0}
else:
return [i for i in links_dict['1'] if i[1] >= min_ct]
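# Example usage (a minimal sketch; 'my_tweets.csv' is a hypothetical file name and 'louv' is assumed to come from get_top_communities as above):
# top_domains = get_top_links('my_tweets.csv',nodes_data=louv.node_list,domains_only=True,exclude_domains=['twitter.com'])
# top_domains maps each community ID to a tuple of (domain,count) pairs in descending order of count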
# shared_ties_grid: arranges counts or proportions of ties shared within and between top communities in a network into a grid
# Description: shared_ties_grid arranges the output of _get_shared_ties into a list of lists which is printable as a grid.
# Arguments:
# ei_obj: a variable of type eiObject containing all the optional attributes.
# rec_sent: a flag determining whether the off-diagonal grid cells will represent received edges ('REC'), sent edges ('SENT'), or sent and received edges summed ('ALL'). Default is 'ALL'.
# calc_propor: If set to True, each cell value will represent a proportion of the total edges in the community indicated by index 0 of the given row. If set to False, shared_ties_grid will output edge counts. Default is False.
# invert: If set to True, the function will output the reciprocals of all the off-diagonal cell values and zeroes for all diagonal values. Cells whose reciprocals would result in division by zero will be assigned a value of 1. The contents of the top row and leftmost column will remain unaltered. If set to False, non-reciprocal values (i.e., shared-tie counts or proportions) will be output. Default is False.
# Output: shared_ties_grid outputs a list of lists in the following format:
# Indices 1 through k of the first row contain all k communities represented in the eiObject. Index 0 is left blank.
# The 0 indices of all remaining rows also contain all k communities represented in the eiObject. Thus the grid always has a size of k+1 x k+1.
# Each off-diagonal grid "cell" represents either the count, inverted count, or proportion (depending on how the flags are set) of edges received or sent (or both) by the community indicated by index 0 of the kth row from or to the community indicated on the kth index of the first row (the "column"). Diagonal grid cells represent the proportions or counts of internal edges of the community indicated by index 0 of the given row.
# Grids created by shared_ties_grid can easily be viewed in the shell using the following code (where ei is a variable of type eiObject created with all_output set to True):
# testgrid = shared_ties_grid(ei)
# for i in testgrid:
# print(i)
def shared_ties_grid(ei_obj,rec_sent='ALL',calc_propor=False,invert=False):
if ei_obj.adj_in is None or ei_obj.adj_out is None:
return 'One or both adjacency matrices empty, cannot create grid :/'
if rec_sent.upper() == 'REC':
raw = ei_obj.adj_in
elif rec_sent.upper() == 'SENT':
raw = ei_obj.adj_out
else:
raw = {}
for i in ei_obj.adj_in:
raw[i] = {}
for j in ei_obj.adj_in:
if j in ei_obj.adj_in[i]:
in_add = ei_obj.adj_in[i][j]
else:
in_add = 0
if j in ei_obj.adj_out[i]:
out_add = ei_obj.adj_out[i][j]
else:
out_add = 0
raw[i][j] = in_add + out_add
raw2 = {}
for i in raw: #converts all subdict keys to ints
raw2[int(i)] = {}
for j in raw[i]:
raw2[int(i)][int(j)] = raw[i][j]
clist = sorted(list(raw2.keys()))
for i in raw2:
raw2[i][i] = ei_obj.internal_ties[str(i)] #add internal ties
for j in clist:
if j not in raw2[i]: #add zeroes for disconnected communities
raw2[i][j] = 0
raw3 = {}
for i in raw2: #put the subdict keys in numerical order
raw3[i] = collections.OrderedDict(sorted(raw2[i].items()))
raw3 = collections.OrderedDict(sorted(raw3.items())) #put the main dict keys in order
outlist = []
for n,i in enumerate(raw3):
if calc_propor == True: #if calc_propor is set, divide each value by the community's total N of ties
outlist.append([round(j/ei_obj.total_ties[str(i)],3) for j in list(raw3[i].values())])
else:
outlist.append(list(raw3[i].values()))
outlist[n].insert(0,i)
outlist.insert(0,['']+clist)
print('Grid created.')
if invert == True:
recip = []
for n,i in enumerate(outlist):
recip.append([])
for x,j in enumerate(outlist[n]):
if n != x and n != 0 and x != 0:
try:
recip[n].append(format(1/outlist[n][x],'f'))
except ZeroDivisionError:
recip[n].append(1)
elif n == x and n != 0 and x != 0:
recip[n].append(0)
else:
recip[n].append(outlist[n][x])
return recip
else:
return outlist
def communities_as_nodes(nodes_data,
edges_data,
directed=False,
remove_selfloops=True):
nodes = load_data(nodes_data)