-
Notifications
You must be signed in to change notification settings - Fork 0
/
nlp_restaurant_analysis.py
126 lines (88 loc) · 3.78 KB
/
nlp_restaurant_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Importing Libraries Numpy and Pandas
import numpy as np
import pandas as pd
# Load the tab-separated review dataset into the 'data' dataframe.
# The path is a raw string: in a normal literal the '\U' in 'C:\Users' starts
# a unicode escape and the file fails to parse. quoting=3 is csv.QUOTE_NONE,
# so quote characters inside reviews are kept as plain text.
data = pd.read_csv(
    r'C:\Users\Administrator\Desktop\Ishika\Work\NLP_ Restaurant_Review\Restaurant_Reviews.tsv',
    delimiter='\t',
    quoting=3,
)
# Show the first rows. A bare `data.head` only references the method and
# displays nothing when the file is run as a script.
print(data.head())
# Importing NLTK (Natural Language Toolkit) and the regex module
import nltk
import re
# Downloading the NLTK stopword list (common words that add little meaning to text),
# then importing the stopword corpus reader and the Porter stemmer class
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Building the corpus: one cleaned, stopword-free, stemmed string per review
corpus = []
# Built once up front; the stopword set and the stemmer do not change between
# words or reviews, so there is no reason to rebuild them inside the loop.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
# Iterate over every review in the dataframe rather than a hard-coded 1000
# rows, so the corpus always has exactly one entry per row of 'data'.
for raw_review in data['Review']:
    # Replace everything that is not a letter with a space, then lower-case
    review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=raw_review)
    review = review.lower()
    reviewgiven = review.split()
    # Drop stopwords and reduce each remaining word to its stem
    reviewgiven = [ps.stem(word) for word in reviewgiven if word not in stop_words]
    # Join the processed tokens with single spaces. Joining the unprocessed
    # string with '' instead would throw away the filtering and stemming.
    review = ' '.join(reviewgiven)
    corpus.append(review)
# Printing and checking first few values of the corpus
print(corpus[0:20])
# Tokenizes words and creates a sparse matrix of word counts,
# limited to a vocabulary of at most 500 terms (max_features)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500)
x = cv.fit_transform(corpus)
# Second column of the dataframe is used as the target
# (presumably the sentiment label; confirm against the TSV header)
y = data.iloc[:, 1].values
# Splitting the dataset into Training (60%) and Testing (40%) data.
# train_test_split returns four objects for two inputs, unpacked here in the
# order train/test features, then train/test targets; random_state pins the shuffle.
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.40, random_state=0)
# Importing Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
# Fit the classifier on the training split
classifier = MultinomialNB()
classifier.fit(xtrain, ytrain)
# Predict labels for the held-out test features. The matrix is passed
# positionally: predict() has no 'xtest' keyword argument.
ypredict = classifier.predict(xtest)
# Calculating Accuracy, Precision and Recall Percentage
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
score1 = accuracy_score(ytest, ypredict)
score2 = precision_score(ytest, ypredict)
# Recall is computed alongside the other two so the output matches what the
# section header announces.
score3 = recall_score(ytest, ypredict)
# Printing Accuracy, Precision and Recall Percentage
print("Accuracy:", score1*100,"%")
print("Precision:", score2*100,"%")
print("Recall:", score3*100,"%")
# Creating Confusion Matrix (true labels first, predicted labels second)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(ytest, ypredict)
print(cm)
# Hyperparameter tuning the Naive Bayes Classifier:
# try smoothing values alpha = 0.1, 0.2, ..., 1.0 and keep the one with the
# highest accuracy on the test split.
best_accuracy = 0.0
alpha_val = 0.0
for alpha in np.arange(0.1, 1.1, 0.1):
    temp_classifier = MultinomialNB(alpha=alpha)
    temp_classifier.fit(xtrain, ytrain)
    # The test matrix is passed positionally; predict() has no 'xtest' keyword.
    temp_y_pred = temp_classifier.predict(xtest)
    score = accuracy_score(ytest, temp_y_pred)
    print("Accuracy score for alpha={} is: {}%".format(round(alpha,1), round(score*100,4)))
    # Strict '>' keeps the smallest alpha when several tie for best accuracy
    if score > best_accuracy:
        best_accuracy = score
        alpha_val = alpha
print('The best accuracy is {}% with alpha value as {}'.format(round(best_accuracy*100, 4), round(alpha_val,1)))
# Refit the final model with the alpha selected above instead of a hard-coded
# value, so the classifier used for predictions always matches the search result.
classifier = MultinomialNB(alpha=alpha_val)
classifier.fit(xtrain, ytrain)
# Predicting Reviews
def predict_sentiment(sample_review):
    """Predict the label of a single review with the fitted classifier.

    The text is reduced to letters only, lower-cased, stripped of English
    stopwords and Porter-stemmed, then vectorized with the module-level
    ``cv`` and passed to the module-level ``classifier``.

    Args:
        sample_review: The review text as a string.

    Returns:
        The array returned by ``classifier.predict`` for this one review
        (a single predicted label).
    """
    # Build the stopword set once per call instead of once per word.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    sample_review = re.sub(pattern='[^a-zA-Z]', repl=' ', string=sample_review)
    sample_review_words = sample_review.lower().split()
    final_review = ' '.join(
        ps.stem(word) for word in sample_review_words if word not in stop_words
    )
    # transform() expects an iterable of documents, hence the one-element list
    temp = cv.transform([final_review]).toarray()
    return classifier.predict(temp)
# Classify two example reviews and print the verdict for each
for sample_review in (
    'The food is really good here.',
    'Food was pretty bad and the service was very slow.',
):
    print(
        'This is a POSITIVE review.'
        if predict_sentiment(sample_review)
        else 'This is a NEGATIVE review!'
    )