-
Notifications
You must be signed in to change notification settings - Fork 0
/
functions.py
226 lines (197 loc) · 6.65 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
def parsePDF(filename, page_start, page_end):
    '''
    Extract text from a page range of a PDF file.

    :param filename: path of the PDF file to extract data from
    :param page_start: integer start page; falsy -> 1
        NOTE(review): getPage() is 0-indexed, so the default of 1 skips
        the first page -- confirm this is intended
    :param page_end: integer end page (exclusive); falsy -> page count
    :return: string with the concatenated page text
    '''
    import PyPDF2
    # default start page when no explicit bound is given
    start = page_start if page_start else 1
    # context manager guarantees the handle is closed even on error
    # (the original opened the file and never closed it)
    with open(filename, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # default end page needs the reader, so it is resolved here
        end = page_end if page_end else pdfReader.numPages
        # go through each page and extract text to string
        text = ''
        for i in range(start, end):
            print("processed page" , str(i))
            text += pdfReader.getPage(i).extractText()
    return text
def clean_word(word):
    '''
    Remove non-alphanumeric characters from a word.

    :param word: string to clean
    :return: cleaned string containing only alphanumeric characters
    '''
    # str.translate(None, chars) is the Python 2 bytes API and raises
    # TypeError on Python 3; build a Unicode translation table
    # (codepoint -> None) over the same latin-1 range the original used.
    delete_table = {ord(c): None for c in map(chr, range(256)) if not c.isalnum()}
    return word.translate(delete_table)
# def lematize_word(word, pos):
# '''
# Function uses WordNetLemmatizer
# :param word: string to lemmatize
# :param pos: n or v (part of speech to give context on how to lematize)
# :return: lemmatized string
# '''
# from nltk.stem import WordNetLemmatizer
# wnl = WordNetLemmatizer()
# token_lem = wnl.lemmatize(word, pos=pos)
#
# return token_lem
def lematize_word(word):
    '''
    Function to return lemma for a word - uses WordNetLemmatizer
    1) find part of speech tag (pos)
    2) convert penn pos to wordnet pos
    3) return lemma based on tag
    :param word: string to lemmatize
    :return: lemmatized string, or None when the word is empty or its
        POS tag has no WordNet equivalent
    '''
    from nltk.stem import WordNetLemmatizer
    from nltk import pos_tag
    # the original 'pass'ed here and then tagged the empty string
    # anyway; bail out explicitly instead
    if word == '':
        return None
    # tag the single word; pos_tag returns [(word, penn_tag)]
    word, pos = pos_tag([word])[0]
    # convert PennTreebank tag to a WordNet tag
    tag = penn_to_wn(pos)
    if tag is None:
        # unsupported part of speech -> no lemma (the original also
        # fell through to an implicit None here)
        return None
    return WordNetLemmatizer().lemmatize(word, pos=tag)
def stem_word(word, stemmer):
    '''
    Stem a word with the selected NLTK stemmer.

    :param word: string to stem
    :param stemmer: which stemmer to use: specify 'port', 'lancaster' or 'snowball'
    :return: stemmed string
    :raises ValueError: for an unknown stemmer name (the original raised
        an opaque UnboundLocalError on 'token_stem' instead)
    '''
    from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer
    # instantiate only the stemmer that is actually requested
    # (the original built all three on every call)
    if stemmer == 'port':
        return PorterStemmer().stem(word=word)
    if stemmer == 'lancaster':
        return LancasterStemmer().stem(word=word)
    if stemmer == 'snowball':
        return SnowballStemmer('english', ignore_stopwords=False).stem(word=word)
    raise ValueError("unknown stemmer: {!r} (expected 'port', 'lancaster' or 'snowball')".format(stemmer))
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
def penn_to_wn(tag):
    """
    Map a PennTreebank POS tag to the corresponding WordNet POS tag.
    credit: https://nlpforhackers.io/sentiment-analysis-intro/
    :param tag: PennTreeBank tag
    :return: WordNet tag, or None when there is no equivalent
    """
    # dispatch on the tag's first letter instead of chained startswith();
    # an empty tag yields '' and falls through to the None default
    first_letter_to_wn = {
        'J': wn.ADJ,
        'N': wn.NOUN,
        'R': wn.ADV,
        'V': wn.VERB,
    }
    return first_letter_to_wn.get(tag[:1])
def cleanup_text(text_string):
    '''
    Strip punctuation and digits from a string and lower-case it.
    :param text_string: string to clean
    :return: clean string
    '''
    import string, re
    # drop every ASCII punctuation character in a single translate() pass
    no_punct = text_string.translate(str.maketrans('', '', string.punctuation))
    # strip digit runs, then normalise case
    return re.sub(r'\d+', '', no_punct).lower()
def get_tokens(text_string):
    '''
    Tokenize a text string into a list of lower-cased words.
    Removes stop words, empty strings and words shorter than 3 characters.
    :param text_string: text string
    :return: list of words
    '''
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords as stopWords
    # build the stopword set ONCE; the original re-evaluated
    # set(stopWords.words("English")) inside the comprehension condition
    # for every single token
    stop_words = set(stopWords.words('English'))
    # split words into list
    tokens = word_tokenize(text_string)
    # remove stop words (pre-lowercase pass, as in the original)
    tokens = [word.lower() for word in tokens if word not in stop_words]
    # second pass catches words whose lower-cased form is a stop word
    tokens = [word for word in tokens if word not in stop_words]
    # len > 2 also discards empty strings, so the original's separate
    # "word != ''" pass is redundant
    return [word for word in tokens if len(word) > 2]
def get_swn_sentiment(sentence):
    """
    Function to retrieve sentiment polarity based on SentiWordNet.
    Objective sentiment is not included in the score.
    Sentiment score = positive - negative score
    NOTE(review): only the score of the LAST scorable word survives the
    loop; if a whole-sentence score is wanted, the per-word scores should
    be summed or averaged -- confirm intent with the caller.
    Input: str
    Output: float (0.0 when no word in the sentence can be scored)
    """
    lemmatizer = WordNetLemmatizer()
    # default for sentences with no scorable word (the original raised
    # UnboundLocalError on 'sentiment' in that case)
    sentiment = 0.0
    for word, pos in pos_tag(word_tokenize(sentence)):
        wn_tag = penn_to_wn(tag=pos)
        # SentiWordNet only covers these four parts of speech
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
            continue
        lemma = lemmatizer.lemmatize(word, wn_tag)
        # senti_synsets returns an iterator in recent NLTK versions, so
        # materialise it before truth-testing and indexing
        synsets = list(swn.senti_synsets(lemma, pos=wn_tag))
        if not synsets:
            continue
        synset = synsets[0]
        # positive minus negative score of the first (most common) synset
        sentiment = synset.pos_score() - synset.neg_score()
    return sentiment
def get_swn_word_sentiment(token, tag):
    """
    Retrieve the SentiWordNet sentiment polarity for a single word.
    Objective sentiment is not included in the sentiment score.
    Sentiment score = positive - negative score
    :param token: word to score
    :param tag: PennTreebank POS tag for the word
    :return: float sentiment score; 0.0 when the word cannot be scored
        (the original inconsistently returned None for unsupported tags
        despite initialising senti_score to 0.0)
    """
    lemmatizer = WordNetLemmatizer()
    # convert to WordNet part of speech tag; other tags aren't
    # supported by SentiWordNet
    wn_tag = penn_to_wn(tag=tag)
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
        return 0.0
    lemma = lemmatizer.lemmatize(token, wn_tag)
    if not lemma:
        return 0.0
    try:
        # senti_synsets returns an iterator in recent NLTK versions, so
        # materialise it before truth-testing and indexing
        synsets = list(swn.senti_synsets(lemma, pos=wn_tag))
    except Exception:
        # the original's bare 'except: pass' swallowed everything,
        # including KeyboardInterrupt; keep the best-effort behaviour
        # but narrow the net
        return 0.0
    if not synsets:
        return 0.0
    synset = synsets[0]
    # return positive - negative sentiment score of the first synset
    return synset.pos_score() - synset.neg_score()