-
Notifications
You must be signed in to change notification settings - Fork 0
/
tweet_generator.py
125 lines (119 loc) · 6.65 KB
/
tweet_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import tensorflow as tf
import os
from textgenrnn import textgenrnn
import multiprocessing as mp
import time
import itertools
import ujson as json
import click
import numpy as np
import name_generator as ng
def generate(k):
    """Generate ``k`` fake tweet texts with the pre-trained textgenrnn model.

    Args:
        k: Number of texts to request from the model (passed as ``n``).

    Returns:
        Whatever ``textgenrnn.generate`` returns with ``return_as_list=True``
        for ``n=k`` and ``max_gen_length=140``.
    """
    # Called without a seed so NumPy re-seeds itself. Presumably this keeps
    # the pool workers that run this function from reusing inherited RNG
    # state -- TODO confirm.
    np.random.seed()
    model = textgenrnn('./weights/twitter_general_weights.hdf5')
    return model.generate(n=k, max_gen_length=140, return_as_list=True)
def _fake_identity(account_id, name_dict):
    """Return the fake identity for ``account_id``, creating it on first use.

    ``name_dict`` maps account ids to ``{'name': ..., 'screen_name': ...}``
    and is updated in place so the same id always gets the same identity.
    """
    if account_id not in name_dict:
        name = ng.gen_two_words(split=' ', lowercase=False)
        screen_name = (''.join(name.split(' ')).lower()
                       + ng.gen_year(1900, 2020)
                       + ng.gen_birthday())
        name_dict[account_id] = {'name': name, 'screen_name': screen_name}
    return name_dict[account_id]


def _apply_fake_identity(account, name_dict):
    """Overwrite ``name`` and ``screen_name`` of one user/mention dict in place."""
    identity = _fake_identity(account['id'], name_dict)
    account['name'] = identity['name']
    account['screen_name'] = identity['screen_name']


def _anonymize_tweet(tweet, fake_text, name_dict):
    """Replace the text and every user identity of ``tweet`` in place.

    Accounts are visited in a fixed order (author, mentions, quoted author,
    quoted mentions, quoted extended-tweet mentions) so that new fake names
    are drawn from the name generator in a stable sequence.
    """
    tweet['text'] = fake_text
    _apply_fake_identity(tweet['user'], name_dict)
    for mention in tweet['entities']['user_mentions']:
        _apply_fake_identity(mention, name_dict)

    if 'quoted_status' not in tweet:
        return
    quoted = tweet['quoted_status']
    _apply_fake_identity(quoted['user'], name_dict)
    for mention in quoted['entities']['user_mentions']:
        _apply_fake_identity(mention, name_dict)
    if 'extended_tweet' in quoted:
        for mention in quoted['extended_tweet']['entities']['user_mentions']:
            _apply_fake_identity(mention, name_dict)


def _read_batch(fin, size):
    """Read up to ``size`` tweets that carry a ``text`` field from ``fin``.

    Returns:
        ``(batch, exhausted)`` where ``exhausted`` is True once reading
        stopped, either at end of file or at a line that is not valid JSON.
    """
    batch = []
    while len(batch) < size:
        try:
            record = json.loads(next(fin))
        except StopIteration:
            print('Reached end of file!')
            return batch, True
        except ValueError:
            # An unparsable line still ends the run, but it is no longer
            # reported as a normal end of file.
            print('Stopped reading: encountered a line that is not valid JSON.')
            return batch, True
        # Lines without a 'text' field (or that are not JSON objects) are
        # skipped and do not count towards the batch.
        if isinstance(record, dict) and 'text' in record:
            batch.append(record)
    return batch, False


# NOTE: click uses the docstring of ``main`` as the command's --help text.
@click.command()
@click.option('--infile', '-i',
              required=True,
              help='Enter the json file storing the original tweets (e.g. tweets.json).')
@click.option('--outfile', '-o',
              required=True,
              help='Enter the json file to store the fake tweets (e.g. fake_tweets.json).')
@click.option('--size', '-k', type=click.IntRange(min=1),
              required=True,
              help='Enter the batch size to generate.')
def main(infile, outfile, size):
    """Rewrite the tweets in INFILE with generated text and fake user names.

    Reads one JSON tweet per line from --infile, processes the tweets in
    batches of --size, and writes each rewritten tweet as one JSON line to
    --outfile. Every account id is mapped to a single fake name and screen
    name for the whole run. --size must be at least 1.
    """
    # silence tensorflow
    tf.logging.set_verbosity(tf.logging.ERROR)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    # generate tweets
    print('Generating general fake tweets...')
    start_time = time.time()
    name_dict = {}
    count = 0
    n_workers = mp.cpu_count()

    with open(infile, 'r') as fin, open(outfile, 'w') as fout:
        exhausted = False
        while not exhausted:
            batch, exhausted = _read_batch(fin, size)
            if not batch:
                # Input ended exactly on a batch boundary: do not spin up
                # workers (and load the model) just to generate nothing.
                continue
            count += len(batch)

            # Ceiling division: every worker generates the same number of
            # texts and together they cover the whole batch.
            per_worker = -(-len(batch) // n_workers)
            # A fresh pool per batch, as before; the context manager tears
            # the workers down even if generation raises.
            with mp.Pool(n_workers) as pool:
                chunks = pool.map(generate, [per_worker] * n_workers)

            # zip() stops at the end of the batch, dropping surplus texts.
            fake_texts = itertools.chain.from_iterable(chunks)
            for tweet, fake_text in zip(batch, fake_texts):
                _anonymize_tweet(tweet, fake_text, name_dict)
                fout.write(json.dumps(tweet) + '\n')

    print('Processed records:', count)
    print('Generate time: {} seconds'.format(time.time() - start_time))
# Run the click command only when this file is executed as a script.
if __name__ == '__main__':
    main()