-
Notifications
You must be signed in to change notification settings - Fork 0
/
Sentence_oneline.py
executable file
·70 lines (61 loc) · 1.85 KB
/
Sentence_oneline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os,re,sys,csv,ast
import pprint as pp
import numpy as np
# Matches one entry of save_words.txt: text of the form ('word'...  (group 2 is the word).
SAVED_WORD_RE = re.compile(r'(\(\')(\w+)(\'(.*))')
# A line containing this marker starts a new tweet record.
SENTENCE_RE = re.compile(r'Sentence\:\s')
# Captures the bracketed list literal that follows "Keyword: " (group 2).
KEYWORD_RE = re.compile(r'(Keyword\:\s)(\[(.*)\])')

WORDS_FILE = "save_words.txt"
INPUT_DIR = "Seperate_sensitive_tweets"
OUTPUT_DIR = "Processed_seperate_tweets"
SAMPLE_SIZE = 10


def read_lines(path):
    """Return the lines of the text file *path* with trailing newlines removed."""
    with open(path, 'r') as handle:
        return [line.rstrip('\n') for line in handle]


def parse_saved_words(saved_words):
    """Extract the word from every entry that matches SAVED_WORD_RE.

    Entries that do not match (for example blank lines) are skipped rather
    than aborting the whole run.
    """
    words = []
    for entry in saved_words:
        match = SAVED_WORD_RE.search(entry)
        if match is not None:
            words.append(match.group(2))
    return words


def merge_sentence_lines(lines):
    """Collapse each multi-line record into a single line.

    Empty lines are discarded.  A line containing "Sentence: " starts a new
    record; any other non-empty line is a continuation and is appended, after
    a single space, to the record before it.  A continuation that appears
    before the first record has nothing to attach to and is dropped (it must
    not wrap around onto the last record).
    """
    merged = []
    for line in lines:
        if line == '':
            continue
        if SENTENCE_RE.search(line):
            merged.append(line)
        elif merged:
            merged[-1] = merged[-1] + ' ' + line
    return merged


def unique_in_order(lines):
    """Return *lines* without duplicates, keeping the first occurrence order."""
    seen = set()
    unique = []
    for line in lines:
        if line not in seen:
            seen.add(line)
            unique.append(line)
    return unique


def keyword_weights(lines):
    """Return the keyword-list length of every line that carries one.

    Lines whose "Keyword: [...]" part is missing or cannot be parsed are
    reported on stdout and skipped, so the result may be shorter than *lines*.
    """
    weights = []
    for index, line in enumerate(lines):
        try:
            found = KEYWORD_RE.search(line)
            keywords = ast.literal_eval(found.group(2))
            weights.append(len(keywords))
        except Exception as exc:
            # Best-effort parsing: report the offending line and carry on,
            # but let KeyboardInterrupt / SystemExit propagate.
            print(type(exc), index, line)
            continue
    return weights


def weighted_sample(weights, size=SAMPLE_SIZE):
    """Draw *size* indices (with replacement) with probability proportional to *weights*.

    Returns an empty integer array when there are no weights or they sum to
    zero, since no probability distribution can be formed in that case.
    NOTE: the indices refer to positions in *weights*, which differ from
    positions in the de-duplicated lines whenever keyword_weights skipped a line.
    """
    values = np.asarray(weights, dtype=float)
    total = values.sum()
    if values.size == 0 or total <= 0:
        return np.array([], dtype=int)
    return np.random.choice(values.size, size, p=values / total)


def process_word(word, in_dir=INPUT_DIR, out_dir=OUTPUT_DIR):
    """Clean the tweet file of *word* and write the result to *out_dir*.

    Reads "<in_dir>/<word>.txt", merges multi-line records, removes
    duplicates and writes one record per line to "<out_dir>/<word>.txt".

    Returns a tuple (number of lines with a parsable keyword list,
    array of sampled indices).
    """
    raw_lines = read_lines(os.path.join(in_dir, "%s.txt" % word))
    unique_lines = unique_in_order(merge_sentence_lines(raw_lines))

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    with open(os.path.join(out_dir, "%s.txt" % word), 'w') as out:
        out.writelines("%s\n" % item for item in unique_lines)

    weights = keyword_weights(unique_lines)
    return len(weights), weighted_sample(weights)


def main():
    """Process every word listed in save_words.txt and print the per-file counts."""
    file_len = []
    for word in parse_saved_words(read_lines(WORDS_FILE)):
        # The sampled indices are computed but not consumed by this script.
        count, _selected = process_word(word)
        file_len.append(count)
    print("file_len: ", file_len)
    return file_len


if __name__ == "__main__":
    main()