Skip to content

Commit 37501dc

Browse files
committed
first commit
0 parents  commit 37501dc

9 files changed

+1374
-0
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
/.ipynb_checkpoints
2+
/.idea
3+
/data

bayes_calssifier.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Naive Bayes sentiment classifier: corpus loading and vocabulary selection.

import random

import nltk
from nltk.corpus import movie_reviews

# Pair every review's token list with its class label, then shuffle so the
# later train/test split is not ordered by class.
print("Starting with pre-processing and feature extraction")
documents = [
    (list(movie_reviews.words(fid)), label)
    for label in movie_reviews.categories()
    for fid in movie_reviews.fileids(label)
]
random.shuffle(documents)

# Rank all corpus words by frequency (case-folded) and keep the 2000 most
# common ones as the feature vocabulary.
all_words = nltk.FreqDist(word.lower() for word in movie_reviews.words())
word_features = list(all_words)[:2000]
18+
19+
20+
# extract feature vector for a document (one boolean feature per vocabulary word)
def document_features(document, vocabulary=None):
    """Build a bag-of-words feature dict for one document.

    Args:
        document: iterable of word tokens for a single review.
        vocabulary: optional iterable of words to use as features; defaults
            to the module-level ``word_features`` (top-2000 corpus words),
            preserving the original behavior.

    Returns:
        dict mapping 'contains(<word>)' to True/False — one entry per
        vocabulary word indicating whether the document contains it.
    """
    if vocabulary is None:
        vocabulary = word_features
    # Set membership is O(1) per lookup vs O(n) scans of the raw token list.
    document_words = set(document)
    return {'contains({})'.format(word): word in document_words
            for word in vocabulary}
27+
28+
29+
# Vectorize every document, then hold out the first 100 shuffled examples
# for evaluation and train on the remainder.
feature_sets = [(document_features(doc), label) for doc, label in documents]
print("Feature extraction for documents done")
test_set = feature_sets[:100]
train_set = feature_sets[100:]

print("Training classifier")
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Report accuracy on the held-out set.
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Classifier accuracy: {}\n".format(accuracy))

# Print the five features Naive Bayes found most informative
# (highest likelihood ratios between the classes).
classifier.show_most_informative_features(5)

extras/7_Sent.pdf

803 KB
Binary file not shown.

logistic_regression_1.ipynb

+220
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"### https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import re\n",
17+
"\n",
18+
"from sklearn.feature_extraction.text import CountVectorizer\n",
19+
"from sklearn.linear_model import LogisticRegression\n",
20+
"from sklearn.metrics import accuracy_score\n",
21+
"from sklearn.model_selection import train_test_split"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": 2,
27+
"metadata": {},
28+
"outputs": [
29+
{
30+
"data": {
31+
"text/plain": [
32+
"'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as \"Teachers\". My 35 years in the teaching profession lead me to believe that Bromwell High\\'s satire is much closer to reality than is \"Teachers\". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\\'t!'"
33+
]
34+
},
35+
"execution_count": 2,
36+
"metadata": {},
37+
"output_type": "execute_result"
38+
}
39+
],
40+
"source": [
41+
"reviews_test = []\n",
42+
"reviews_train = []\n",
43+
"\n",
44+
"for line in open('data/aclImdb/movie_data/full_train.txt', encoding=\"utf8\"):\n",
45+
" reviews_train.append(line.strip())\n",
46+
"reviews_train[0]"
47+
]
48+
},
49+
{
50+
"cell_type": "code",
51+
"execution_count": 3,
52+
"metadata": {},
53+
"outputs": [
54+
{
55+
"data": {
56+
"text/plain": [
57+
"\"I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge.\""
58+
]
59+
},
60+
"execution_count": 3,
61+
"metadata": {},
62+
"output_type": "execute_result"
63+
}
64+
],
65+
"source": [
66+
"for line in open('data/aclImdb/movie_data/full_test.txt', encoding=\"utf8\"):\n",
67+
" reviews_test.append(line.strip())\n",
68+
"reviews_test[0]"
69+
]
70+
},
71+
{
72+
"cell_type": "code",
73+
"execution_count": 4,
74+
"metadata": {},
75+
"outputs": [],
76+
"source": [
77+
"REPLACE_NO_SPACE = re.compile(\"[.;:!\\'?,\\\"()\\[\\]]\")\n",
78+
"REPLACE_WITH_SPACE = re.compile(\"(<br\\s*/><br\\s*/>)|(\\-)|(\\/)\")\n",
79+
"\n",
80+
"def preprocess_reviews(reviews):\n",
81+
" reviews = [REPLACE_WITH_SPACE.sub(\" \", REPLACE_NO_SPACE.sub(\"\", review.lower())) for review in reviews]\n",
82+
" return reviews\n",
83+
"reviews_train_clean = preprocess_reviews(reviews_train)\n",
84+
"reviews_test_clean = preprocess_reviews(reviews_test)"
85+
]
86+
},
87+
{
88+
"cell_type": "code",
89+
"execution_count": 5,
90+
"metadata": {},
91+
"outputs": [],
92+
"source": [
93+
"cv = CountVectorizer(binary=True)\n",
94+
"cv.fit(reviews_train_clean)\n",
95+
"X = cv.transform(reviews_train_clean)\n",
96+
"X_test = cv.transform(reviews_test_clean)"
97+
]
98+
},
99+
{
100+
"cell_type": "code",
101+
"execution_count": 6,
102+
"metadata": {},
103+
"outputs": [],
104+
"source": [
105+
"target = [1 if x < 12500 else 0 for x in range(25000)]\n",
106+
"X_train, X_val, y_train, y_val = train_test_split(X, target, train_size=0.75)"
107+
]
108+
},
109+
{
110+
"cell_type": "code",
111+
"execution_count": 7,
112+
"metadata": {},
113+
"outputs": [
114+
{
115+
"name": "stdout",
116+
"output_type": "stream",
117+
"text": [
118+
"Accuracy for C=0.01: 0.87232\n",
119+
"Accuracy for C=0.05: 0.88128\n",
120+
"Accuracy for C=0.25: 0.88608\n",
121+
"Accuracy for C=0.5: 0.88336\n",
122+
"Accuracy for C=1: 0.88096\n"
123+
]
124+
}
125+
],
126+
"source": [
127+
"for c in [0.01, 0.05, 0.25, 0.5, 1]:\n",
128+
" lr = LogisticRegression(C=c, max_iter=500)\n",
129+
" lr.fit(X_train, y_train)\n",
130+
" print(\"Accuracy for C=%s: %s\" % (c, accuracy_score(y_val, lr.predict(X_val))))"
131+
]
132+
},
133+
{
134+
"cell_type": "code",
135+
"execution_count": 8,
136+
"metadata": {},
137+
"outputs": [
138+
{
139+
"name": "stdout",
140+
"output_type": "stream",
141+
"text": [
142+
"Final Accuracy: 0.88172\n"
143+
]
144+
}
145+
],
146+
"source": [
147+
"final_model = LogisticRegression(C=0.05, max_iter=200)\n",
148+
"final_model.fit(X, target)\n",
149+
"print(\"Final Accuracy: %s\" % accuracy_score(target, final_model.predict(X_test)))"
150+
]
151+
},
152+
{
153+
"cell_type": "code",
154+
"execution_count": 9,
155+
"metadata": {},
156+
"outputs": [
157+
{
158+
"name": "stdout",
159+
"output_type": "stream",
160+
"text": [
161+
"('excellent', 0.928930721254566)\n",
162+
"('perfect', 0.7924073691466867)\n",
163+
"('great', 0.6738714624335574)\n",
164+
"('amazing', 0.6128669705071148)\n",
165+
"('superb', 0.6007357897217929)\n"
166+
]
167+
}
168+
],
169+
"source": [
170+
"feature_to_coef = {word: coef for word, coef in zip(cv.get_feature_names_out(), final_model.coef_[0])}\n",
171+
"\n",
172+
"for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:\n",
173+
" print(best_positive)"
174+
]
175+
},
176+
{
177+
"cell_type": "code",
178+
"execution_count": 10,
179+
"metadata": {},
180+
"outputs": [
181+
{
182+
"name": "stdout",
183+
"output_type": "stream",
184+
"text": [
185+
"('worst', -1.364739117410738)\n",
186+
"('waste', -1.166566080710549)\n",
187+
"('awful', -1.03207672028567)\n",
188+
"('poorly', -0.873759054451777)\n",
189+
"('boring', -0.8568152889177791)\n"
190+
]
191+
}
192+
],
193+
"source": [
194+
"for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1])[:5]:\n",
195+
" print(best_negative)"
196+
]
197+
}
198+
],
199+
"metadata": {
200+
"kernelspec": {
201+
"display_name": "Python 3",
202+
"language": "python",
203+
"name": "python3"
204+
},
205+
"language_info": {
206+
"codemirror_mode": {
207+
"name": "ipython",
208+
"version": 3
209+
},
210+
"file_extension": ".py",
211+
"mimetype": "text/x-python",
212+
"name": "python",
213+
"nbconvert_exporter": "python",
214+
"pygments_lexer": "ipython3",
215+
"version": "3.7.7"
216+
}
217+
},
218+
"nbformat": 4,
219+
"nbformat_minor": 4
220+
}

0 commit comments

Comments
 (0)