-
Notifications
You must be signed in to change notification settings - Fork 0
/
tweetManip.py
295 lines (267 loc) · 10.8 KB
/
tweetManip.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
from nltk.corpus import wordnet as wn
from random import randint
from hyphen import Hyphenator, dict_info
from hyphen.dictools import *
import sys
import re
import copy
class buildHaiku(object):
    """Collect tweets into a three-line haiku built around a key word.

    Line 1 and line 3 are filled from 5-syllable tweet openings and line 2
    from a 7-syllable opening.  A tweet is only accepted when its text
    matches a synonym of the key word, an antonym of it, or a season word,
    and each of those three categories may contribute at most one line.
    """

    def __init__(self):
        self.finalHaiku = list()
        # File with one season word (used as a regex pattern) per line.
        self.seasonWordPath = 'winterWords.txt'
        self.keyWord = relatedWords()
        # Upper bound on the number of synonyms requested from relatedWords.
        self.listLength = 100
        self.syns = []
        self.ants = []
        self.seasons = []
        # Each line is a list of words; an empty list means "not filled yet".
        self.line1 = []
        self.line2 = []
        self.line3 = []
        # One flag per category: True once that category has supplied a line.
        self.synsFound = False
        self.antsFound = False
        self.seasonsFound = False

    def setWord(self, wordText):
        """Build the synonym, antonym and season word lists for wordText.

        The three lists are trimmed to the length of the antonym list.
        Underscores in the synonym/antonym entries are replaced by spaces.

        Returns the concatenation syns + ants + seasons.
        """
        self.keyWord = relatedWords(wordText)
        # Close the file deterministically, and drop empty lines: an empty
        # pattern would match every tweet in classifyTweet.
        with open(self.seasonWordPath) as seasonFile:
            seasonWords = [w for w in seasonFile.read().splitlines() if w]
        syns = self.keyWord.buildWordList(True, self.listLength)
        ants = self.keyWord.buildWordList(False, len(syns))
        syns = syns[:len(ants)]
        self.seasons = seasonWords[:len(ants)]
        self.syns = [w.replace('_', ' ') for w in syns]
        self.ants = [w.replace('_', ' ') for w in ants]
        return self.syns + self.ants + self.seasons

    def newTweet(self, tweetText):
        """Try to use tweetText as one line of the haiku.

        The tweet is first tried as a 7-syllable line and, if that was not
        accepted, as a 5-syllable line.

        Returns the haiku as a newline-joined string once all three lines
        are filled; otherwise returns an empty list.
        """
        tweetObj = checkTweet(tweetText)
        print("----checking tweet:  %s" % tweetObj.rawText)
        if not tweetObj.qualityControl():
            print("the tweet did not pass QC")
            return list()
        if not self.classifyTweet(tweetObj.checkSylbls(7), 7):
            self.classifyTweet(tweetObj.checkSylbls(5), 5)
        if self.line1 and self.line2 and self.line3:
            lines = (self.line1, self.line2, self.line3)
            return "\n".join(" ".join(words) for words in lines)
        return list()

    def classifyTweet(self, tweetWordList, Nsyls):
        """Store tweetWordList as a haiku line if it fits an open category.

        tweetWordList -- list of words whose syllables sum to Nsyls
                         (an empty list is rejected immediately)
        Nsyls         -- 7 (candidate for line 2) or 5 (line 1 or line 3)

        Categories are tried in the order synonym, antonym, season.  A
        category is skipped when it already supplied a line, when none of
        its words is found in the tweet, or when no line of the right
        length is still free.

        Returns True if the tweet was stored, False otherwise.
        """
        if not tweetWordList:
            return False
        singleStr = " ".join(tweetWordList)
        # (word list attr, "found" flag attr, preferred order of the
        #  5-syllable lines, message printed on success)
        categories = (
            ('syns', 'synsFound', ('line1', 'line3'),
             'this tweet will be the synonym'),
            ('ants', 'antsFound', ('line3', 'line1'),
             'this tweet will be the antonym'),
            ('seasons', 'seasonsFound', ('line1', 'line3'),
             'this tweet will be the season'),
        )
        for wordsAttr, flagAttr, fiveSylSlots, message in categories:
            if getattr(self, flagAttr):
                continue
            words = getattr(self, wordsAttr)
            # Each word is used as a regular expression, as before.
            if not any(re.search(word, singleStr) for word in words):
                continue
            if self._claimLine(tweetWordList, Nsyls, fiveSylSlots):
                setattr(self, flagAttr, True)
                print(message)
                return True
        return False

    def _claimLine(self, tweetWordList, Nsyls, fiveSylSlots):
        """Put tweetWordList in the first free line that takes Nsyls syllables.

        Returns True if a line was filled, False if none was available.
        """
        if Nsyls == 7:
            slots = ('line2',)
        elif Nsyls == 5:
            slots = fiveSylSlots
        else:
            slots = ()
        for slot in slots:
            if not getattr(self, slot):
                setattr(self, slot, tweetWordList)
                return True
        return False
class checkTweet(object):
    """Clean up one tweet and extract a leading run of N syllables from it."""

    def __init__(self, text='Defualt Tweet'):
        # Only keep ASCII characters.  The pattern value is identical to
        # the raw form [^\x00-\x7F]; it is spelled with escaped backslashes
        # so the literal is valid in both Python 2 and Python 3.
        self.rawText = re.sub(u'[^\\x00-\\x7F]', u'', text)
        self.textWords = self.rawText.split()
        self.h_en = Hyphenator('en_US')
        # Words containing any of these patterns are dropped from the tweet.
        self.badSymbols = ['http:', 'https:', '&']
        # A tweet containing any of these patterns is rejected outright.
        self.forbiddenThings = ['@']  # random syms
        # A tweet containing any of these whole words is rejected outright.
        self.forbiddenWords = ['el', 'la', 'en', 'tu',  # spanish
                               'Et', 'le', 'aux', 'les', 'de', 'des', 'du',
                               'il', 'Elle', 'ses', 'sa', 'ces', 'cela',
                               'est', 'vous', 'tous', 'nous',
                               'allez', 'alons']  # french
        # A haiku line may not end on one of these words.
        self.forbiddenEnds = ['the', 'and', 'a', 'an', 'for', 'at', 'except',
                              'or', 'has', 'my', 'your', 'their', 'his',
                              'hers', 'her\'s', 'get', 'it\'ll', 'to', 'like',
                              'is', 'I']

    def qualityControl(self):
        """Normalise self.textWords and decide whether the tweet is usable.

        Returns False if a forbidden symbol or word remains after clean-up,
        True otherwise.
        """
        self.replaceText()
        self.remove_at_symbol_first()
        self.remove_symbolWords()
        if self.check_forbiddenThings():
            return False
        print("post QC tweet:  %s" % " ".join(self.textWords))
        return True

    def replaceText(self):
        """Replace every '#' in every word by the text 'hashtag '."""
        self.textWords = [w.replace('#', 'hashtag ') for w in self.textWords]

    def remove_at_symbol_first(self):
        """Drop a leading word containing 'RT', then a leading '@' word.

        Does nothing when there are no words (left).
        """
        if self.textWords and re.search('RT', self.textWords[0]):
            del self.textWords[0]
        if self.textWords and re.search('@', self.textWords[0]):
            del self.textWords[0]

    def remove_symbolWords(self):
        """Remove, in place, every word that matches one of self.badSymbols."""
        # Build the filtered list first: deleting while enumerating would
        # skip the word that follows each deleted word.
        self.textWords[:] = [
            word for word in self.textWords
            if not any(re.search(s, word) for s in self.badSymbols)
        ]

    def words_no_vowels(self, wordList):
        """Return True if some word in wordList has none of a/e/i/o/u/y."""
        for word in wordList:
            if not re.search("([aeiouyAEIOUY]+)", word):
                print("%s  - did not contain any vowels" % word)
                return True
        return False

    def check_forbiddenThings(self):
        """Return True if the tweet holds a forbidden symbol or word.

        forbiddenThings are searched anywhere inside a word; forbiddenWords
        must match a whole word, ignoring case.
        """
        for s in self.forbiddenThings:
            if any(re.search(s, word) for word in self.textWords):
                print("the forbidden thing:  %s  was found" % s)
                return True
        for s in self.forbiddenWords:
            wholeWord = '^' + s + '$'
            if any(re.search(wholeWord, word, re.IGNORECASE)
                   for word in self.textWords):
                print("the forbidden word:  %s  was found" % s)
                return True
        return False

    def checkSylbls(self, Nsyls):
        """Return the leading words that add up to exactly Nsyls syllables.

        Returns an empty list when no such prefix exists, when one of its
        words has no vowel, or when it ends on a word in forbiddenEnds.
        """
        finalWords = self.confirmSylsCounts(Nsyls)
        if not finalWords or self.words_no_vowels(finalWords) \
                or finalWords[-1] in self.forbiddenEnds:
            return list()
        print("%s syls found... final text:  %s" % (Nsyls, finalWords))
        return finalWords

    def confirmSylsCounts(self, Nsyls):
        """Count syllables word by word until Nsyls is reached.

        Each word is counted with the hyphenation dictionary (at least one
        syllable) and with count_syllables().  The dictionary count is used
        when it equals the heuristic minimum or maximum; otherwise the
        heuristic count is used if its minimum and maximum agree.  If
        neither holds, the word is "too hard" and the tweet is given up.

        Returns the prefix of self.textWords whose syllables sum to exactly
        Nsyls, or an empty list.
        """
        sylsCount = 0
        used = 0
        for word in self.textWords:
            if sylsCount >= Nsyls:
                break
            if len(word) >= 100:  # hyphenator will break and something is crazy
                return list()
            libreSyls = max(len(self.h_en.syllables(word)), 1)
            minSyls, maxSyls = self.count_syllables(word)
            if libreSyls == minSyls or libreSyls == maxSyls:
                sylsCount += libreSyls
            elif minSyls == maxSyls:
                sylsCount += maxSyls
            else:  # this tweet is too hard
                return list()
            used += 1
        if sylsCount == Nsyls:
            return self.textWords[:used]
        return list()

    def count_syllables(self, word):
        """Estimate the syllables of word from its vowel groups.

        Returns a (minimum, maximum) tuple; (0, 0) for an empty word.
        """
        if not word:
            return 0, 0
        vowels = ['a', 'e', 'i', 'o', 'u']
        on_vowel = False
        in_diphthong = False
        minsyl = 0
        maxsyl = 0
        lastchar = None
        word = word.lower()
        for c in word:
            is_vowel = c in vowels
            # y is a special case: it counts as a vowel only when the
            # previous character was not one.
            if c == 'y':
                is_vowel = not on_vowel
            if is_vowel:
                if not on_vowel:
                    # Start of a new vowel group: one more syllable.
                    minsyl += 1
                    maxsyl += 1
                elif not in_diphthong and c != lastchar:
                    # A different vowel right after a vowel may be a second
                    # syllable, so only the maximum grows.
                    # NOTE: in_diphthong is never reset, so this happens at
                    # most once per word.
                    in_diphthong = True
                    maxsyl += 1
            on_vowel = is_vowel
            lastchar = c
        # Some special cases:
        # a final 'e' may be silent.
        if word[-1] == 'e':
            minsyl -= 1
        # A final 'y' that was not itself counted as a vowel (i.e. one that
        # directly follows a vowel) may add a syllable.
        if word[-1] == 'y' and not on_vowel:
            maxsyl += 1
        return minsyl, maxsyl
class relatedWords(object):
    """Look up words related (or opposed) to a centre word in WordNet."""

    def __init__(self, centerWord='good'):
        self.centerWord = centerWord

    def buildWordList(self, synonyms, length):
        """Return up to `length` words related to the centre word.

        synonyms -- True: start from the centre word itself;
                    False: start from an antonym of it (see _unrelated_word)
        length   -- number of words wanted

        The search is repeated with growing recursion depth
        ("desperation") until one attempt yields at least `length` words or
        the depth would exceed 10.  Only the words of the last attempt are
        returned, cut to `length`.
        """
        if synonyms:
            finalWord = self.centerWord
        else:
            finalWord = self._unrelated_word(self.centerWord)
        desperation = 0  # current recursion depth
        Ntries = 0       # attempts made at the current depth
        thisTry = []
        while length > len(thisTry):
            if Ntries > desperation:
                # Enough attempts at this depth: go one level deeper.
                desperation = desperation + 1
                Ntries = 1
                if desperation > 10:
                    print("10 recursions only yeilded:  %s  words!"
                          % len(thisTry))
                    break
            else:
                Ntries = Ntries + 1
            thisTry = self._related_words(finalWord, 0, desperation)
        return thisTry[:length]

    def _related_words(self, word, curDepth, targetDepth):
        """Return lemma names from one randomly chosen synset of word.

        While curDepth is below targetDepth the lookup is repeated for
        every lemma name found, and the de-duplicated union of the results
        is returned.  A word without any synset yields [word].
        """
        synsets = wn.synsets(word)
        if not synsets:
            # Always return a list: callers concatenate, slice and iterate
            # the result as a list of words.
            return [word]
        chosen = synsets[randint(0, len(synsets) - 1)]
        outputList = list(chosen.lemma_names())
        if curDepth == targetDepth:
            return outputList
        finalList = []
        for curWord in outputList:
            finalList.extend(
                self._related_words(curWord, curDepth + 1, targetDepth))
        return list(set(finalList))

    def _unrelated_word(self, word):
        """Return a word opposed to `word`, or `word` itself if none exists.

        Every synset of every word related to `word` is searched for a
        lemma with an antonym; the first lemma name of the first antonym's
        synset is returned.
        """
        synsets = []
        for item in self._related_words(word, 0, 0):
            synsets.extend(wn.synsets(item))
        for synset in set(synsets):
            for lemma in synset.lemmas():
                for antonym in lemma.antonyms():
                    names = antonym.synset().lemma_names()
                    if names:
                        return names[0]
        return word