# this code is adapted from the following tutorial:
# http://nlpforhackers.io/sentiment-analysis-intro/
# it uses the unigram approach with SentiWordNet
# based on this code we wrote a bigram model for SentiWordNet
import pandas as pd
data = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
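# header=0 reads column names from the first row, delimiter="\t" parses the TSV,
# and quoting=3 (csv.QUOTE_NONE) ignores quote characters inside the reviews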
# 25,000 labeled movie reviews; the columns used below are "review" and "sentiment"
import random
random.seed(5)
sentiment_data = list(zip(data["review"], data["sentiment"]))
random.shuffle(sentiment_data)
# 80% for training
train_X, train_y = zip(*sentiment_data[:20000])
# Keep 20% for testing
test_X, test_y = zip(*sentiment_data[20000:])
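# note: swn_polarity below is a lexicon-based scorer, so nothing is actually
# trained; the "training" slice is only used for the spot checks further down,
# while the held-out 5,000 reviews (20%) measure accuracy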
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
lemmatizer = WordNetLemmatizer()
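# the tokenizer, tagger, and corpus calls below need these NLTK resources
# installed locally; a one-time download sketch (standard NLTK package names):
# import nltk
# for pkg in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'sentiwordnet'):
#     nltk.download(pkg)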
def penn_to_wn(tag):
    """
    Convert Penn Treebank POS tags to simplified WordNet POS tags.
    """
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    return None
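# example mapping (illustrative): penn_to_wn('JJ') -> wn.ADJ,
# penn_to_wn('NNS') -> wn.NOUN, penn_to_wn('RB') -> wn.ADV,
# penn_to_wn('VBD') -> wn.VERB, penn_to_wn('DT') -> None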
def clean_text(text):
    # strip the HTML line breaks that appear in the raw reviews
    text = text.replace("<br />", " ")
    return text
def swn_polarity(text):
    """
    Return a sentiment polarity: 0 = negative, 1 = positive
    """
    sentiment = 0.0
    tokens_count = 0
    text = clean_text(text)
    raw_sentences = sent_tokenize(text)
    for raw_sentence in raw_sentences:
        tagged_sentence = pos_tag(word_tokenize(raw_sentence))
        for word, tag in tagged_sentence:
            wn_tag = penn_to_wn(tag)
            # only nouns, adjectives, and adverbs are scored; verbs are skipped
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            if not lemma:
                continue
            synsets = wn.synsets(lemma, pos=wn_tag)
            if not synsets:
                continue
            # take the first sense, which is the most common
            synset = synsets[0]
            swn_synset = swn.senti_synset(synset.name())
            sentiment += swn_synset.pos_score() - swn_synset.neg_score()
            tokens_count += 1
    # judgment call: with no scorable tokens, default to negative
    if not tokens_count:
        return 0
    # non-negative sum => positive sentiment (ties count as positive)
    if sentiment >= 0:
        return 1
    # negative sum => negative sentiment
    return 0
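# illustrative usage on made-up inputs (not from the dataset); the exact outputs
# depend on SentiWordNet's first-sense scores, so these are expected, not guaranteed:
# print(swn_polarity("A wonderful, moving film."))  # expected 1
# print(swn_polarity("A dull, terrible film."))     # expected 0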
# the shuffle is seeded with random.seed(5), so this split is reproducible;
# with a different seed you would see different examples here
print(swn_polarity(train_X[0]), train_y[0])  # 1 1
print(swn_polarity(train_X[1]), train_y[1])  # 0 0
print(swn_polarity(train_X[2]), train_y[2])  # 0 1
print(swn_polarity(train_X[3]), train_y[3])  # 1 1
print(swn_polarity(train_X[4]), train_y[4])  # 1 1
from sklearn.metrics import accuracy_score
pred_y = [swn_polarity(text) for text in test_X]
print(accuracy_score(test_y, pred_y)) # 0.6518
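# optional follow-up (sketch, assuming scikit-learn as imported above):
# a per-class precision/recall breakdown of the same predictions
# from sklearn.metrics import classification_report
# print(classification_report(test_y, pred_y))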