Commit 071e4e1 by Mohammad Khoddam, Apr 13, 2021 (1 parent: d2ea2db).
3 changed files with 2,209 additions and 0 deletions.
New file (+159 lines), exported from Keras-Final-Project.ipynb:
# -*- coding: utf-8 -*-
"""Keras-Final-Project.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1-ZxA-xEilzc3kRseYVZfTh7IE-UUyZLc
"""

import sys
import os
import re

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.preprocessing.text import Tokenizer
from keras.layers.convolutional import Convolution1D
from keras import backend as K

!pip install hazm
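# hazm is a Persian (Farsi) NLP toolkit; it is installed here because the
# Normalizer used below comes from it. The next cell expects a local
# stop_words.txt file (one stop word per line) in the Colab working directory.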
lineList = list()
with open('stop_words.txt') as f:
    for line in f:
        lineList.append(line)

lineList = [x.strip() for x in lineList]

from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/train.csv')
from __future__ import unicode_literals
from hazm import *
import re, string

def remove_stopwords(text):
    text = [word for word in text.split() if word not in lineList]
    return " ".join(text)

normalizer = Normalizer()

special = re.compile(r'\W')
single = re.compile(r'\s+', flags=re.I)

number = re.compile(r"[-+]?[0-9]+")
pnumber = re.compile(r"[-+][\u06F0-\u06F90-9]+")
url = re.compile(r"https?://\S+|www\.\S+")
html = re.compile(r"<.*?>")
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)
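# The patterns above strip URLs, HTML tags, emoji, signed/unsigned Latin
# digits, signed Persian digits, non-word characters, and runs of whitespace.
# Observation (not in the original): unsigned Persian digits (۰-۹) are matched
# by neither `number` nor `pnumber`, and they count as word characters for
# `special`, so they pass through this cleaning step.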
df["Text"] = df.Text.map(remove_stopwords)
df["Text"] = df.Text.map(lambda x: url.sub(r" ", x))
df["Text"] = df.Text.map(lambda x: html.sub(r" ", x))
df["Text"] = df.Text.map(lambda x: emoji_pattern.sub(r" ", x))
df["Text"] = df.Text.map(lambda x: number.sub(r" ", x))
df["Text"] = df.Text.map(lambda x: pnumber.sub(r" ", x))
df["Text"] = df.Text.map(lambda x: x.translate(str.maketrans(" ", " ", string.punctuation)))
df["Text"] = df.Text.map(lambda x: special.sub(r" ", x))
df["Text"] = df.Text.map(lambda x: single.sub(r" ", x))
df["Text"] = df.Text.map(lambda x: normalizer.normalize(x))
df.Category = df.Category.astype('category')

nb_classes = df.Category.cat.codes.max() + 1

df["CategoryCodes"] = df.Category.cat.codes

X_train, X_test, y_train, y_test = train_test_split(df.Text, df.CategoryCodes, test_size=0.1, random_state=42)
max_features = 20000
maxlen = 1000

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

# max_features = len(tokenizer.word_index) + 1
# maxlen = max([len(s.split()) for s in df.Text])

X_train = sequence.pad_sequences(sequences_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(sequences_test, maxlen=maxlen, padding='post')
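# The tokenizer keeps only the 20,000 most frequent tokens (num_words) and is
# fit on the training split only; both splits are then zero-padded or
# truncated to 1,000 tokens so the Embedding/GRU sees fixed-length input.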
# Y_train = np_utils.to_categorical(y_train, nb_classes)
# Y_test = np_utils.to_categorical(y_test, nb_classes)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

batch_size = 32
from keras.callbacks import ModelCheckpoint

model = Sequential()
model.add(Embedding(max_features, 256, input_length=maxlen))
model.add(GRU(512, return_sequences=False))
# model.add(Dropout(0.5))
model.add(Dense(nb_classes, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
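# Note (kept as in the original): the output layer uses a sigmoid activation.
# With sparse_categorical_crossentropy over mutually exclusive classes, a
# softmax output is the conventional choice; sigmoid yields unnormalized
# per-class scores, although argmax-based predictions still work.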
filepath = "weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

model.fit(X_train, y_train, batch_size=batch_size, epochs=5,
          validation_data=(X_test, y_test), callbacks=callbacks_list)
df_test = pd.read_csv('/content/drive/MyDrive/test.csv.zip')

df_test["Text"] = df_test.Text.map(remove_stopwords)
df_test["Text"] = df_test.Text.map(lambda x: url.sub(r" ", x))
df_test["Text"] = df_test.Text.map(lambda x: html.sub(r" ", x))
df_test["Text"] = df_test.Text.map(lambda x: emoji_pattern.sub(r" ", x))
df_test["Text"] = df_test.Text.map(lambda x: number.sub(r" ", x))
df_test["Text"] = df_test.Text.map(lambda x: pnumber.sub(r" ", x))
df_test["Text"] = df_test.Text.map(lambda x: x.translate(str.maketrans(" ", " ", string.punctuation)))
df_test["Text"] = df_test.Text.map(lambda x: special.sub(r" ", x))
df_test["Text"] = df_test.Text.map(lambda x: single.sub(r" ", x))
df_test["Text"] = df_test.Text.map(lambda x: normalizer.normalize(x))
sequences_final = tokenizer.texts_to_sequences(df_test.Text)
X_final = sequence.pad_sequences(sequences_final, maxlen=maxlen, padding='post')
# The checkpoint filename below comes from one particular training run; point
# it at the best "weights-improvement-*.hdf5" file produced by the fit above.
model.load_weights("/content/weights-improvement-02-0.84.hdf5")
Y_final = model.predict(X_final)
Y_classes = Y_final.argmax(axis=-1)
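# Sketch (not in the original): pick the newest checkpoint automatically
# instead of hard-coding one run's filename. Assumes the .hdf5 files written
# by ModelCheckpoint above are in the current working directory.
# import glob, os
# ckpts = sorted(glob.glob("weights-improvement-*.hdf5"), key=os.path.getmtime)
# if ckpts:
#     model.load_weights(ckpts[-1])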
# df = pd.read_csv('/content/drive/MyDrive/train.csv')
# df.Category = df.Category.astype('category')

Y_classes_Name = df.Category.cat.categories[Y_classes]

df_test = df_test.rename(columns={"Text": "Category"})
df_test.Category = Y_classes_Name

df_test.to_csv("Final.csv", index=False)
New file (+205 lines), exported from SKlearn-Final-Project.ipynb:
# -*- coding: utf-8 -*-
"""SKlearn-Final-Project.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_Dk0kPlBe0zjNb3eV-Z06SPpG3b1mYmw
"""

!pip install hazm

import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords
from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/test.csv.zip')

X, Y = df.Text, df.Category

# lineList = list()
# with open('stop_words.txt') as f:
#     for line in f:
#         lineList.append(line)

# lineList = [x.strip() for x in lineList]

df_test  # notebook cell: display the test dataframe
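# The commented-out block below reproduces the cleaning pipeline from
# Keras-Final-Project.ipynb (stop words, URLs, HTML, emoji, digits,
# punctuation); it was left disabled in this notebook.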
# from __future__ import unicode_literals
# from hazm import *
# import re, string

# def remove_stopwords(text):
#     text = [word for word in text.split() if word not in lineList]
#     return " ".join(text)

# normalizer = Normalizer()

# special = re.compile(r'\W')
# single = re.compile(r'\s+', flags=re.I)

# number = re.compile(r"[-+]?[0-9]+")
# pnumber = re.compile(r"[-+][\u06F0-\u06F90-9]+")
# url = re.compile(r"https?://\S+|www\.\S+")
# html = re.compile(r"<.*?>")
# emoji_pattern = re.compile(
#     "["
#     u"\U0001F600-\U0001F64F"  # emoticons
#     u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#     u"\U0001F680-\U0001F6FF"  # transport & map symbols
#     u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#     u"\U00002702-\U000027B0"
#     u"\U000024C2-\U0001F251"
#     "]+",
#     flags=re.UNICODE,
# )

# # df["Text"] = df.Text.map(remove_stopwords)
# # df["Text"] = df.Text.map(lambda x: url.sub(r" ",x))
# # df["Text"] = df.Text.map(lambda x: html.sub(r" ",x))
# # df["Text"] = df.Text.map(lambda x: emoji_pattern.sub(r" ",x))
# # df["Text"] = df.Text.map(lambda x: number.sub(r" ",x))
# # df["Text"] = df.Text.map(lambda x: pnumber.sub(r" ",x))
# df["Text"] = df.Text.map(lambda x: x.translate(str.maketrans(" ", " ", string.punctuation)))
# df["Text"] = df.Text.map(lambda x: special.sub(r" ",x))
# df["Text"] = df.Text.map(lambda x: single.sub(r" ", x))
# # df["Text"] = df.Text.map(lambda x: normalizer.normalize(x))

# # df_test["Text"] = df_test.Text.map(remove_stopwords)
# # df_test["Text"] = df_test.Text.map(lambda x: url.sub(r" ",x))
# # df_test["Text"] = df_test.Text.map(lambda x: html.sub(r" ",x))
# # df_test["Text"] = df_test.Text.map(lambda x: emoji_pattern.sub(r" ",x))
# # df_test["Text"] = df_test.Text.map(lambda x: number.sub(r" ",x))
# # df_test["Text"] = df_test.Text.map(lambda x: pnumber.sub(r" ",x))
# df_test["Text"] = df_test.Text.map(lambda x: x.translate(str.maketrans(" ", " ", string.punctuation)))
# df_test["Text"] = df_test.Text.map(lambda x: special.sub(r" ",x))
# df_test["Text"] = df_test.Text.map(lambda x: single.sub(r" ", x))
# # df_test["Text"] = df_test.Text.map(lambda x: normalizer.normalize(x))

# # for i in range(df_texts_size):
# #     bound = len(df_texts_hazm[i].split(' ')) - 1
# #     for char in "0123456789۰۱۲۳۴۵۶۷۸۹!#$%&().*+,،-/:;<=>?@[\\]^_`{|}~\t\n":
# #         df_texts_hazm[i] = df_texts_hazm[i].replace(char, ' ', bound)
# # # for word in lineList:
# # #     df_texts_hazm[i] = df_texts_hazm[i].replace(word, ' ', bound)

# # df_texts_hazm[i] = url.sub(r"",df_texts_hazm[i])
# # df_texts_hazm[i] = html.sub(r"",df_texts_hazm[i])
# # df_texts_hazm[i] = emoji_pattern.sub(r"",df_texts_hazm[i])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Text, df.Category, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
# tfidfconverter = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.8)
tfidfconverter = TfidfVectorizer()

X = tfidfconverter.fit_transform(X_train)
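# The TF-IDF vectorizer is fit on the training split only; the held-out split
# and the final test set are later mapped into the same vocabulary with
# transform(), never fit_transform().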
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.svm import SVC, NuSVC
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import PassiveAggressiveClassifier
# parameters = {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

# classifier = GaussianNB()
# classifier = GridSearchCV(classifier, parameters, n_jobs=-1)
# classifier = RandomForestClassifier(n_estimators=200, random_state=0)
# classifier = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=10, random_state=0, max_features=None)
# classifier_sgc = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=5000)

# classifier_linsvc = SVC(kernel='linear', probability=True)
# classifier_logreg = LogisticRegression(max_iter=1000)
# classifier = KNeighborsClassifier(n_neighbors=3)
# classifier = SVC()
# classifier = OneVsRestClassifier(SVC(), n_jobs=-1)
# classifier = XGBClassifier()
# classifier = LinearSVC(loss="hinge", max_iter=10000, C=3)

# classifier = NuSVC(probability=True)
# classifier = LinearDiscriminantAnalysis()

# classifier_sgc.fit(X, y_train)
# classifier_linsvc.fit(X, y_train)
# classifier_logreg.fit(X, y_train)

# estimators = [('sgc', classifier_sgc), ('svc', classifier_linsvc)]

# classifier = VotingClassifier(estimators, voting='hard', n_jobs=-1)

# classifier = classifier_sgc

# classifier = NBSVM(C=0.001)

# classifier.fit(X, y_train)

# classifier = PassiveAggressiveClassifier()

# pipe = Pipeline(steps=[("tfidf_vectorization", TfidfVectorizer()), ("classifier", MultinomialNB())])

# search_space = [{"classifier": [MultinomialNB()]},
#                 {"classifier": [LinearSVC()]},
#                 {"classifier": [PassiveAggressiveClassifier()]},
#                 {"classifier": [LogisticRegression()],
#                  "classifier__solver": ["liblinear"]},
#                 {"classifier": [KNeighborsClassifier()],
#                  "classifier__n_neighbors": [6, 7, 8]}]

# classifier = GridSearchCV(estimator=pipe, param_grid=search_space, cv=10, return_train_score=True, n_jobs=-1, refit="AUC")
# Two best models:
classifier = SGDClassifier(loss="modified_huber", max_iter=100000, alpha=0.00001)
# classifier = LinearSVC()

classifier.fit(X, y_train)
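# SGDClassifier with loss="modified_huber" trains a linear classifier with a
# smoothed hinge loss via stochastic gradient descent; alpha=1e-5 is a weak
# L2 penalty (the default penalty type). LinearSVC was the runner-up, per the
# comment above.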
X = tfidfconverter.transform(X_test)
y_pred = classifier.predict(X)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
X = tfidfconverter.transform(df_test.Text)
Y_final = classifier.predict(X)
df_test_res = df_test.copy()
df_test_res.Text = Y_final
df_test_res = df_test_res.rename(columns={"Text": "Category"})
df_test_res.to_csv("Final.csv", index=False)
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)
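# Sketch (not in the original): the pickled classifier alone cannot score new
# text, because the fitted TfidfVectorizer is not saved. Persisting both and
# reloading them for inference would look roughly like this:
# with open('tfidf_vectorizer', 'wb') as f:
#     pickle.dump(tfidfconverter, f)
# with open('text_classifier', 'rb') as f:
#     clf = pickle.load(f)
# with open('tfidf_vectorizer', 'rb') as f:
#     vec = pickle.load(f)
# print(clf.predict(vec.transform(["متن نمونه"])))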
!ls