
Commit 562b124: "Adding local files"
Initial commit (0 parents). 12 files changed: +1252, -0 lines.

.gitignore

+3
@@ -0,0 +1,3 @@
*.zip
*.pdf
*.csv

BernoulliNaiveClassifier.py

+63
@@ -0,0 +1,63 @@
import numpy as np

# Class implementing a Bernoulli naive Bayes classifier
class BernoulliNaiveBayse():
    def __init__(self):
        print('Bernoulli Naive Bayes Classifier')

    # Trains the Bernoulli naive Bayes classifier and predicts labels for the test data
    def BNB_Classifier(self, X_train, Y_train, X_test):
        Smooth = 1                     # Laplace smoothing parameter
        NumFeat = X_train[0].size      # number of features in the training data
        UM = []
        ClassPr = []                   # class (prior) probabilities
        FeatPr = []                    # per-class feature probabilities
        LenM = Y_train.size            # length of the label vector, Y
        UM.append(np.unique(Y_train))
        LenF = X_train[0].size         # length of the feature vector, X
        NumClass = int(UM[0].size)     # number of classes in the training data
        Cfeats = {}                    # dictionary of per-class feature counts
        Cclass = {}                    # dictionary of per-class example counts
        # Accumulate feature counts per class, then compute class and feature probabilities
        for ind in range(LenM):
            if Y_train[ind] not in Cfeats:
                Cfeats[Y_train[ind]] = [0 for j in range(LenF)]
        for ind in range(LenM):
            for con in range(LenF):
                Cfeats[Y_train[ind]][con] += X_train[ind][con]
        for ind in range(LenM):
            if Y_train[ind] in Cclass:
                Cclass[Y_train[ind]] += 1
            else:
                Cclass[Y_train[ind]] = 1
        ClassLabels = []               # class label corresponding to each index in ClassPr/FeatPr
        for CN in Cfeats:
            ClassLabels.append(CN)
            YtrainSize = int(LenM)
            temp = np.array([])
            ClassPr.append(float(Cclass[CN] + Smooth) / float(YtrainSize + NumClass * Smooth))
            for i in range(LenF):
                temp = np.append(temp, float(Cfeats[CN][i] + Smooth) / float(Cclass[CN] + 2 * Smooth))
            FeatPr.append(temp)
        PC = ClassPr
        PF = FeatPr
        NC = NumClass
        NF = NumFeat
        Output = np.array([])
        for i in range(X_test.shape[0]):
            ClassID = 0                # index of the best class so far
            prob_max = -10**10         # running maximum log-probability
            prob = 0                   # log-probability of the current class
            for CN in range(NC):
                prob = np.log(PC[CN])
                for j in range(NF):
                    Curent_Class_ID = X_test[i][j]
                    if Curent_Class_ID == 0:
                        prob += np.log(1 - PF[CN][j])
                    else:
                        prob += np.log(PF[CN][j])
                if prob > prob_max:
                    prob_max = prob
                    ClassID = CN
            # map the winning index back to the actual class label
            Output = np.append(Output, ClassLabels[ClassID])
        return Output
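
For reference, a minimal usage sketch (not part of this commit) of the classifier above on a toy binary-feature dataset. The data values and the expected output shown in the comment are illustrative assumptions only:

import numpy as np
from BernoulliNaiveClassifier import BernoulliNaiveBayse

# Toy training set: 4 samples, 3 binary features, labels in {0, 1}
X_train = np.array([[1, 0, 1],
                    [1, 1, 0],
                    [0, 0, 1],
                    [0, 1, 0]])
Y_train = np.array([1, 1, 0, 0])
X_test = np.array([[1, 0, 0],
                   [0, 1, 1]])

model = BernoulliNaiveBayse()
predictions = model.BNB_Classifier(X_train, Y_train, X_test)
print(predictions)   # for this toy data, roughly array([1., 0.])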

NBSVM.py

+157
@@ -0,0 +1,157 @@
from NBSVMpreprocessing import create_bow, build_vocab
import numpy as np
from sklearn.svm import LinearSVC
import time
"""
Naive Bayes / Support Vector Machine interpolation (NBSVM).
"""

## Tuning parameters:
gram = 2       # n-gram order
C = 100        # SVM regularization parameter
beta = 0.25    # interpolation weight
alpha = 1      # Laplace smoothing parameter


"""
Trains the Multinomial Naive Bayes model.
"""
def train_nb(vocab_list, df):

    # Prior: total positive examples / total examples
    total_sents = len(df['label'])
    pos_sents = 0
    neg_sents = 0
    for i in range(len(df['label'])):
        if df['label'][i] == 1:
            pos_sents += 1
    neg_sents = total_sents - pos_sents

    # Initialize counts of word appearance conditional on label == 1 and label == 0;
    # alpha is the Laplacian smoothing parameter
    pos_list = np.ones(len(vocab_list)) * alpha
    neg_list = np.ones(len(vocab_list)) * alpha

    for sentence, label in zip(df['sentence'], df['label']):
        bow = create_bow(sentence, vocab_list, gram)

        if label == 1:
            pos_list += bow
        else:
            neg_list += bow

    # Calculate the log-count ratio
    x = pos_list / abs(pos_list).sum()
    y = neg_list / abs(neg_list).sum()
    r = np.log(x / y)
    b = np.log(pos_sents / neg_sents)

    return r, b

"""
Trains the (linear-kernel) SVM with L2 regularization.
"""
def train_svm(vocab_list, df_train, c, r):
    # clf = LinearSVC(C=c, class_weight=None, dual=False, fit_intercept=True,
    #                 loss='squared_hinge', max_iter=1000,
    #                 multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
    #                 verbose=0)
    print('creating SVM model')
    clf = LinearSVC(C=c)
    print('creating training matrix')
    M = np.array([])
    X = np.zeros((len(df_train['sentence']), len(vocab_list)))
    bow = np.array([])
    con = 0
    for sentence in df_train['sentence']:
        print('iteration: {}'.format(con + 1))
        bow = create_bow(sentence, vocab_list, gram)
        M = r * bow                      # scale the bag-of-words vector by the log-count ratio
        for i in range(len(M)):
            X[con, i] = M[i]
        # X.append(M)
        con = con + 1
    # X = np.array([(r * create_bow(sentence, vocab_list, gram)) for sentence in df_train['sentence']])
    y = df_train['label']

    clf.fit(X, y)
    svm_coef = clf.coef_
    svm_intercept = clf.intercept_

    return svm_coef, svm_intercept, clf

"""
Predicts classification with MNB.
"""
def predict(df_test, w, b, vocab_list):
    total_sents = len(df_test['label'])
    total_score = 0

    for sentence, label in zip(df_test['sentence'], df_test['label']):
        bow = create_bow(sentence, vocab_list, gram)

        result = np.sign(np.dot(bow, w.T) + b)
        if result == -1:
            result = 0
        if result == label:
            total_score += 1

    return total_score / total_sents

"""
Predicts classification with NB-SVM.
"""
def predict_nbsvm(df_test, svm_coef, svm_intercept, r, b, vocab_list):
    total_sents = len(df_test['label'])
    total_score = 0

    for sentence, label in zip(df_test['sentence'], df_test['label']):
        bow = r * create_bow(sentence, vocab_list, gram)
        w_bar = abs(svm_coef).sum() / len(vocab_list)       # mean magnitude of the SVM weights
        w_prime = (1 - beta) * w_bar + beta * svm_coef      # interpolated weight vector
        result = np.sign(np.dot(bow, w_prime.T) + svm_intercept)
        if result == -1:
            result = 0
        if result == label:
            total_score += 1

    return total_score / total_sents


if __name__ == "__main__":

    time_first = time.time()
    print("Building Dataset...")
    vocab_list, df_train, df_val, df_test = build_vocab(gram)

    print("Training Multinomial Naive Bayes...")
    r, b = train_nb(vocab_list, df_train)

    # Train SVM
    print("Training LinearSVM...")
    svm_coef, svm_intercept, clf = train_svm(vocab_list, df_train, C, r)

    # Test models
    print("Test using NBSVM ({}-gram):".format(gram))
    accuracy = predict_nbsvm(df_val, svm_coef, svm_intercept, r, b, vocab_list)
    print("Beta: {} Accuracy: {}".format(beta, accuracy))

    print("Test using MNB ({}-gram):".format(gram))
    mnb_acc = predict(df_val, r, b, vocab_list)
    print("Accuracy: {}".format(mnb_acc))
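
For reference, a minimal sketch (not part of this commit) of the weight interpolation that predict_nbsvm applies: the SVM weight vector w is blended with its mean magnitude w_bar as w' = (1 - beta) * w_bar + beta * w, in the spirit of the NBSVM formulation of Wang and Manning (2012). The coefficient values below are made up for illustration:

import numpy as np

beta = 0.25
svm_coef = np.array([[0.8, -0.4, 0.0, 1.2]])       # shape (1, |vocab|), as returned by LinearSVC.coef_
w_bar = abs(svm_coef).sum() / svm_coef.shape[1]    # mean magnitude of the weights = 0.6
w_prime = (1 - beta) * w_bar + beta * svm_coef     # elementwise interpolation toward the mean
print(w_prime)                                     # approximately [[0.65 0.35 0.45 0.75]]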

NBSVMpreprocessing.py

+131
@@ -0,0 +1,131 @@
from nltk.corpus import stopwords
import numpy as np
import os
from sklearn.model_selection import train_test_split


def create_bow(sentence, vocab_list, gram):
    word_list = tokenize(sentence, gram)
    bow = np.zeros(len(vocab_list))

    for word in word_list:
        if word in vocab_list:
            bow[vocab_list[word]] = 1
    return bow

def rm_stopwords(word_list):
    return [word for word in word_list if word not in stopwords.words('english')]

def tokenize(sent, grams):
    words_list = rm_stopwords(sent.split())
    sent_tok = []
    for gram in range(1, grams + 1):
        for i in range(len(words_list) + 1 - gram):
            sent_tok.append("-".join(words_list[i:i + gram]))
    return sent_tok

"""
Loads the raw data.
"""
def build_vocab(gram):

    # Load data
    cwd = os.getcwd()
    # Paths of the data (Windows-style, relative to the working directory)
    data_path_training_neg = cwd + r'\train\neg'
    data_path_training_pos = cwd + r'\train\pos'
    data_path_test = cwd + r'\test'
    File_names_neg = []
    File_names_pos = []
    File_names_test = []

    # Training and test variable definition
    Training_data_text = []
    Training_data_class = []
    Test_data_text = []

    # Read negative training data from txt files
    for root, dirs, files in os.walk(data_path_training_neg):
        for name in files:
            if name.endswith(".txt"):
                File_names_neg.append(name)
    File_names_sorted_neg = sorted(File_names_neg, key=lambda x: int(x.split('_')[0]))
    for i in range(len(File_names_sorted_neg)):
        txtfile_path = data_path_training_neg + '\\' + File_names_sorted_neg[i]
        d = open(txtfile_path, 'r', encoding="utf8")
        Training_data_text.append(d.read())
        Training_data_class.append(0)
        d.close()

    # Read positive training data from txt files
    for root, dirs, files in os.walk(data_path_training_pos):
        for name in files:
            if name.endswith(".txt"):
                File_names_pos.append(name)
    File_names_sorted_pos = sorted(File_names_pos, key=lambda x: int(x.split('_')[0]))
    for i in range(len(File_names_sorted_pos)):
        txtfile_path = data_path_training_pos + '\\' + File_names_sorted_pos[i]
        d = open(txtfile_path, 'r', encoding="utf8")
        Training_data_text.append(d.read())
        Training_data_class.append(1)
        d.close()

    # Read test data from txt files
    for root, dirs, files in os.walk(data_path_test):
        for name in files:
            if name.endswith(".txt"):
                File_names_test.append(name)
    File_names_sorted = sorted(File_names_test)
    for i in range(len(File_names_sorted)):
        txtfile_path = data_path_test + '\\' + File_names_sorted[i]
        d = open(txtfile_path, 'r', encoding="utf8")
        Test_data_text.append(d.read())
        d.close()

    # Split the training data into train and validation sets
    X_train, X_val, Y_train, Y_val = train_test_split(Training_data_text, Training_data_class, train_size=0.8, test_size=0.2)

    # Create the training data dictionary
    Training_data_dic = {}
    for i in range(len(X_train)):
        Training_data_dic.setdefault('sentence', []).append(X_train[i])
        Training_data_dic.setdefault('label', []).append(Y_train[i])
    # Create the validation data dictionary
    Validation_data_dic = {}
    for i in range(len(X_val)):
        Validation_data_dic.setdefault('sentence', []).append(X_val[i])
        Validation_data_dic.setdefault('label', []).append(Y_val[i])
    # Create the test data dictionary
    Test_data_dic = {}
    for i in range(len(Test_data_text)):
        Test_data_dic.setdefault('sentence', []).append(Test_data_text[i])

    # Create the vocabulary dictionary
    word_count = 0
    vocab_list = {}

    # Build the vocabulary set from the training sentences
    vocab_set = set()
    for sentence in Training_data_dic['sentence']:
        word_list = tokenize(sentence, gram)
        vocab_set.update(word_list)

    # Assign each word a unique index
    for word in vocab_set:
        vocab_list[word] = word_count
        word_count += 1

    df_train = Training_data_dic
    df_val = Validation_data_dic
    df_test = Test_data_dic

    return vocab_list, df_train, df_val, df_test
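
For reference, a minimal sketch (not part of this commit) of what tokenize() and create_bow() produce for gram = 2: stopwords are dropped, then unigrams and bigrams (joined with '-') are emitted, and create_bow marks which vocabulary entries occur in a sentence. The sentence and toy vocabulary below are illustrative assumptions, and running this requires the NLTK stopwords corpus:

from NBSVMpreprocessing import tokenize, create_bow

tokens = tokenize("this movie was surprisingly good", 2)
# With "this" and "was" removed as stopwords, this yields something like:
# ['movie', 'surprisingly', 'good', 'movie-surprisingly', 'surprisingly-good']

vocab_list = {tok: i for i, tok in enumerate(tokens)}   # toy vocabulary index
bow = create_bow("surprisingly good movie", vocab_list, 2)
# bow is a binary vector over the toy vocabulary; here it marks 'movie',
# 'surprisingly', 'good' and the bigram 'surprisingly-good' as present.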
