
Commit 562b124: "Adding local files"
Initial commit (0 parents). 12 files changed: +1252, -0 lines.

.gitignore

+3
@@ -0,0 +1,3 @@
*.zip
*.pdf
*.csv

BernoulliNaiveClassifier.py

+63
@@ -0,0 +1,63 @@
import numpy as np

# Class implementing a Bernoulli naive Bayes classifier
class BernoulliNaiveBayse():
    def __init__(self):
        print('Bernoulli Naive Bayes Classifier')

    # Trains the Bernoulli naive Bayes classifier and predicts labels for the test data
    def BNB_Classifier(self, X_train, Y_train, X_test):
        Smooth = 1                     # Laplace smoothing parameter
        NumFeat = X_train[0].size      # number of features in the training data
        UM = []
        ClassPr = []                   # class (prior) probabilities
        FeatPr = []                    # per-class feature probabilities
        LenM = Y_train.size            # length of the label vector, Y
        UM.append(np.unique(Y_train))
        LenF = X_train[0].size         # length of the feature vector, X
        NumClass = int(UM[0].size)     # number of classes in the training data
        Cfeats = {}                    # dictionary of per-class feature counts
        Cclass = {}                    # dictionary of per-class example counts
        # Accumulate feature counts per class, then compute class and feature probabilities
        for ind in range(LenM):
            if Y_train[ind] not in Cfeats:
                Cfeats[Y_train[ind]] = [0 for j in range(LenF)]
        for ind in range(LenM):
            for con in range(LenF):
                Cfeats[Y_train[ind]][con] += X_train[ind][con]
        for ind in range(LenM):
            if Y_train[ind] in Cclass:
                Cclass[Y_train[ind]] += 1
            else:
                Cclass[Y_train[ind]] = 1
        ClassLabels = []               # class label corresponding to each index in ClassPr/FeatPr
        for CN in Cfeats:
            ClassLabels.append(CN)
            YtrainSize = int(LenM)
            temp = np.array([])
            ClassPr.append(float(Cclass[CN] + Smooth) / float(YtrainSize + NumClass * Smooth))
            for i in range(LenF):
                temp = np.append(temp, float(Cfeats[CN][i] + Smooth) / float(Cclass[CN] + 2 * Smooth))
            FeatPr.append(temp)
        PC = ClassPr
        PF = FeatPr
        NC = NumClass
        NF = NumFeat
        Output = np.array([])
        for i in range(X_test.shape[0]):
            ClassID = 0                # index of the best class so far
            prob_max = -10**10         # running maximum log-probability
            prob = 0                   # log-probability of the current class
            for CN in range(NC):
                prob = np.log(PC[CN])
                for j in range(NF):
                    Curent_Class_ID = X_test[i][j]
                    if Curent_Class_ID == 0:
                        prob += np.log(1 - PF[CN][j])
                    else:
                        prob += np.log(PF[CN][j])
                if prob > prob_max:
                    prob_max = prob
                    ClassID = CN
            # map the winning index back to the actual class label
            Output = np.append(Output, ClassLabels[ClassID])
        return Output
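
For reference, a minimal usage sketch (not part of this commit) of the classifier above on a toy binary-feature dataset. The data values and the expected output shown in the comment are illustrative assumptions only:

import numpy as np
from BernoulliNaiveClassifier import BernoulliNaiveBayse

# Toy training set: 4 samples, 3 binary features, labels in {0, 1}
X_train = np.array([[1, 0, 1],
                    [1, 1, 0],
                    [0, 0, 1],
                    [0, 1, 0]])
Y_train = np.array([1, 1, 0, 0])
X_test = np.array([[1, 0, 0],
                   [0, 1, 1]])

model = BernoulliNaiveBayse()
predictions = model.BNB_Classifier(X_train, Y_train, X_test)
print(predictions)   # for this toy data, roughly array([1., 0.])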

NBSVM.py

+157
@@ -0,0 +1,157 @@
from NBSVMpreprocessing import create_bow, build_vocab
import numpy as np
from sklearn.svm import LinearSVC
import time
"""
Naive Bayes / Support Vector Machine interpolation (NBSVM).
"""

## Tuning parameters:
gram = 2       # n-gram order
C = 100        # SVM regularization parameter
beta = 0.25    # interpolation weight
alpha = 1      # Laplace smoothing parameter


"""
Trains the Multinomial Naive Bayes model.
"""
def train_nb(vocab_list, df):

    # Prior: total positive examples / total examples
    total_sents = len(df['label'])
    pos_sents = 0
    neg_sents = 0
    for i in range(len(df['label'])):
        if df['label'][i] == 1:
            pos_sents += 1
    neg_sents = total_sents - pos_sents

    # Initialize counts of word appearance conditional on label == 1 and label == 0;
    # alpha is the Laplacian smoothing parameter
    pos_list = np.ones(len(vocab_list)) * alpha
    neg_list = np.ones(len(vocab_list)) * alpha

    for sentence, label in zip(df['sentence'], df['label']):
        bow = create_bow(sentence, vocab_list, gram)

        if label == 1:
            pos_list += bow
        else:
            neg_list += bow

    # Calculate the log-count ratio
    x = pos_list / abs(pos_list).sum()
    y = neg_list / abs(neg_list).sum()
    r = np.log(x / y)
    b = np.log(pos_sents / neg_sents)

    return r, b

"""
Trains the (linear-kernel) SVM with L2 regularization.
"""
def train_svm(vocab_list, df_train, c, r):
    # clf = LinearSVC(C=c, class_weight=None, dual=False, fit_intercept=True,
    #                 loss='squared_hinge', max_iter=1000,
    #                 multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
    #                 verbose=0)
    print('creating SVM model')
    clf = LinearSVC(C=c)
    print('creating training matrix')
    M = np.array([])
    X = np.zeros((len(df_train['sentence']), len(vocab_list)))
    bow = np.array([])
    con = 0
    for sentence in df_train['sentence']:
        print('iteration: {}'.format(con + 1))
        bow = create_bow(sentence, vocab_list, gram)
        M = r * bow                      # scale the bag-of-words vector by the log-count ratio
        for i in range(len(M)):
            X[con, i] = M[i]
        # X.append(M)
        con = con + 1
    # X = np.array([(r * create_bow(sentence, vocab_list, gram)) for sentence in df_train['sentence']])
    y = df_train['label']

    clf.fit(X, y)
    svm_coef = clf.coef_
    svm_intercept = clf.intercept_

    return svm_coef, svm_intercept, clf

"""
Predicts classification with MNB.
"""
def predict(df_test, w, b, vocab_list):
    total_sents = len(df_test['label'])
    total_score = 0

    for sentence, label in zip(df_test['sentence'], df_test['label']):
        bow = create_bow(sentence, vocab_list, gram)

        result = np.sign(np.dot(bow, w.T) + b)
        if result == -1:
            result = 0
        if result == label:
            total_score += 1

    return total_score / total_sents

"""
Predicts classification with NB-SVM.
"""
def predict_nbsvm(df_test, svm_coef, svm_intercept, r, b, vocab_list):
    total_sents = len(df_test['label'])
    total_score = 0

    for sentence, label in zip(df_test['sentence'], df_test['label']):
        bow = r * create_bow(sentence, vocab_list, gram)
        w_bar = abs(svm_coef).sum() / len(vocab_list)       # mean magnitude of the SVM weights
        w_prime = (1 - beta) * w_bar + beta * svm_coef      # interpolated weight vector
        result = np.sign(np.dot(bow, w_prime.T) + svm_intercept)
        if result == -1:
            result = 0
        if result == label:
            total_score += 1

    return total_score / total_sents


if __name__ == "__main__":

    time_first = time.time()
    print("Building Dataset...")
    vocab_list, df_train, df_val, df_test = build_vocab(gram)

    print("Training Multinomial Naive Bayes...")
    r, b = train_nb(vocab_list, df_train)

    # Train SVM
    print("Training LinearSVM...")
    svm_coef, svm_intercept, clf = train_svm(vocab_list, df_train, C, r)

    # Test models
    print("Test using NBSVM ({}-gram):".format(gram))
    accuracy = predict_nbsvm(df_val, svm_coef, svm_intercept, r, b, vocab_list)
    print("Beta: {} Accuracy: {}".format(beta, accuracy))

    print("Test using MNB ({}-gram):".format(gram))
    mnb_acc = predict(df_val, r, b, vocab_list)
    print("Accuracy: {}".format(mnb_acc))
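
For reference, a minimal sketch (not part of this commit) of the weight interpolation that predict_nbsvm applies: the SVM weight vector w is blended with its mean magnitude w_bar as w' = (1 - beta) * w_bar + beta * w, in the spirit of the NBSVM formulation of Wang and Manning (2012). The coefficient values below are made up for illustration:

import numpy as np

beta = 0.25
svm_coef = np.array([[0.8, -0.4, 0.0, 1.2]])       # shape (1, |vocab|), as returned by LinearSVC.coef_
w_bar = abs(svm_coef).sum() / svm_coef.shape[1]    # mean magnitude of the weights = 0.6
w_prime = (1 - beta) * w_bar + beta * svm_coef     # elementwise interpolation toward the mean
print(w_prime)                                     # approximately [[0.65 0.35 0.45 0.75]]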

NBSVMpreprocessing.py

+131
@@ -0,0 +1,131 @@
from nltk.corpus import stopwords
import numpy as np
import os
from sklearn.model_selection import train_test_split


def create_bow(sentence, vocab_list, gram):
    word_list = tokenize(sentence, gram)
    bow = np.zeros(len(vocab_list))

    for word in word_list:
        if word in vocab_list:
            bow[vocab_list[word]] = 1
    return bow

def rm_stopwords(word_list):
    return [word for word in word_list if word not in stopwords.words('english')]

def tokenize(sent, grams):
    words_list = rm_stopwords(sent.split())
    sent_tok = []
    for gram in range(1, grams + 1):
        for i in range(len(words_list) + 1 - gram):
            sent_tok.append("-".join(words_list[i:i + gram]))
    return sent_tok

"""
Loads the raw data.
"""
def build_vocab(gram):

    # Load data
    cwd = os.getcwd()
    # Paths of the data (Windows-style, relative to the working directory)
    data_path_training_neg = cwd + r'\train\neg'
    data_path_training_pos = cwd + r'\train\pos'
    data_path_test = cwd + r'\test'
    File_names_neg = []
    File_names_pos = []
    File_names_test = []

    # Training and test variable definition
    Training_data_text = []
    Training_data_class = []
    Test_data_text = []

    # Read negative training data from txt files
    for root, dirs, files in os.walk(data_path_training_neg):
        for name in files:
            if name.endswith(".txt"):
                File_names_neg.append(name)
    File_names_sorted_neg = sorted(File_names_neg, key=lambda x: int(x.split('_')[0]))
    for i in range(len(File_names_sorted_neg)):
        txtfile_path = data_path_training_neg + '\\' + File_names_sorted_neg[i]
        d = open(txtfile_path, 'r', encoding="utf8")
        Training_data_text.append(d.read())
        Training_data_class.append(0)
        d.close()

    # Read positive training data from txt files
    for root, dirs, files in os.walk(data_path_training_pos):
        for name in files:
            if name.endswith(".txt"):
                File_names_pos.append(name)
    File_names_sorted_pos = sorted(File_names_pos, key=lambda x: int(x.split('_')[0]))
    for i in range(len(File_names_sorted_pos)):
        txtfile_path = data_path_training_pos + '\\' + File_names_sorted_pos[i]
        d = open(txtfile_path, 'r', encoding="utf8")
        Training_data_text.append(d.read())
        Training_data_class.append(1)
        d.close()

    # Read test data from txt files
    for root, dirs, files in os.walk(data_path_test):
        for name in files:
            if name.endswith(".txt"):
                File_names_test.append(name)
    File_names_sorted = sorted(File_names_test)
    for i in range(len(File_names_sorted)):
        txtfile_path = data_path_test + '\\' + File_names_sorted[i]
        d = open(txtfile_path, 'r', encoding="utf8")
        Test_data_text.append(d.read())
        d.close()

    # Split the training data into train and validation sets
    X_train, X_val, Y_train, Y_val = train_test_split(Training_data_text, Training_data_class, train_size=0.8, test_size=0.2)

    # Create the training data dictionary
    Training_data_dic = {}
    for i in range(len(X_train)):
        Training_data_dic.setdefault('sentence', []).append(X_train[i])
        Training_data_dic.setdefault('label', []).append(Y_train[i])
    # Create the validation data dictionary
    Validation_data_dic = {}
    for i in range(len(X_val)):
        Validation_data_dic.setdefault('sentence', []).append(X_val[i])
        Validation_data_dic.setdefault('label', []).append(Y_val[i])
    # Create the test data dictionary
    Test_data_dic = {}
    for i in range(len(Test_data_text)):
        Test_data_dic.setdefault('sentence', []).append(Test_data_text[i])

    # Create the vocabulary dictionary
    word_count = 0
    vocab_list = {}

    # Build the vocabulary set from the training sentences
    vocab_set = set()
    for sentence in Training_data_dic['sentence']:
        word_list = tokenize(sentence, gram)
        vocab_set.update(word_list)

    # Assign each word a unique index
    for word in vocab_set:
        vocab_list[word] = word_count
        word_count += 1

    df_train = Training_data_dic
    df_val = Validation_data_dic
    df_test = Test_data_dic

    return vocab_list, df_train, df_val, df_test
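
For reference, a minimal sketch (not part of this commit) of what tokenize() and create_bow() produce for gram = 2: stopwords are dropped, then unigrams and bigrams (joined with '-') are emitted, and create_bow marks which vocabulary entries occur in a sentence. The sentence and toy vocabulary below are illustrative assumptions, and running this requires the NLTK stopwords corpus:

from NBSVMpreprocessing import tokenize, create_bow

tokens = tokenize("this movie was surprisingly good", 2)
# With "this" and "was" removed as stopwords, this yields something like:
# ['movie', 'surprisingly', 'good', 'movie-surprisingly', 'surprisingly-good']

vocab_list = {tok: i for i, tok in enumerate(tokens)}   # toy vocabulary index
bow = create_bow("surprisingly good movie", vocab_list, 2)
# bow is a binary vector over the toy vocabulary; here it marks 'movie',
# 'surprisingly', 'good' and the bigram 'surprisingly-good' as present.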
