Commit
Add files via upload
Mohammad Khoddam authored Apr 13, 2021
1 parent d2ea2db commit 071e4e1
Showing 3 changed files with 2,209 additions and 0 deletions.
159 changes: 159 additions & 0 deletions keras_final_project.py
@@ -0,0 +1,159 @@
# -*- coding: utf-8 -*-
"""Keras-Final-Project.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1-ZxA-xEilzc3kRseYVZfTh7IE-UUyZLc
"""

import sys
import os
import re

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.preprocessing.text import Tokenizer
from keras.layers.convolutional import Convolution1D
from keras import backend as K

!pip install hazm

# Read the Persian stop-word list, one word per line.
lineList = list()
with open('stop_words.txt') as f:
    for line in f:
        lineList.append(line)

lineList = [x.strip() for x in lineList]

from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/train.csv')

from __future__ import unicode_literals
from hazm import *
import re, string

def remove_stopwords(text):
    # Keep only tokens that are not in the stop-word list.
    text = [word for word in text.split() if word not in lineList]
    return " ".join(text)

normalizer = Normalizer()


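# Pre-compiled cleaning patterns: non-word characters, whitespace runs,
# optionally signed Latin digits, signed Persian/Latin digits, URLs,
# HTML tags, and emoji.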
special = re.compile(r'\W')
single = re.compile(r'\s+', flags=re.I)

number = re.compile(r"[-+]?[0-9]+")
pnumber = re.compile(r"[-+][\u06F0-\u06F90-9]+")
url = re.compile(r"https?://\S+|www\.\S+")
html = re.compile(r"<.*?>")
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # enclosed characters
    "]+",
    flags=re.UNICODE,
)


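# Clean the training text: drop stop words, then strip URLs, HTML tags,
# emoji, digits, punctuation, and leftover non-word characters, collapse
# whitespace, and finish with hazm's normalizer.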
df["Text"] = df.Text.map(remove_stopwords)
df["Text"] = df.Text.map(lambda x: url.sub(r" ",x))
df["Text"] = df.Text.map(lambda x: html.sub(r" ",x))
df["Text"] = df.Text.map(lambda x: emoji_pattern.sub(r" ",x))
df["Text"] = df.Text.map(lambda x: number.sub(r" ",x))
df["Text"] = df.Text.map(lambda x: pnumber.sub(r" ",x))
df["Text"] = df.Text.map(lambda x: x.translate(str.maketrans(" ", " ", string.punctuation)))
df["Text"] = df.Text.map(lambda x: special.sub(r" ",x))
df["Text"] = df.Text.map(lambda x: single.sub(r" ", x))
df["Text"] = df.Text.map(lambda x: normalizer.normalize(x))

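# Encode labels as a pandas categorical; the integer codes are the training
# targets and nb_classes is the number of distinct categories.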
df.Category = df.Category.astype('category')

nb_classes = df.Category.cat.codes.max() + 1

df["CategoryCodes"] = df.Category.cat.codes

X_train, X_test, y_train, y_test = train_test_split(df.Text, df.CategoryCodes, test_size=0.1, random_state=42)

max_features = 20000
maxlen = 1000

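# Fit the vocabulary (top max_features words) on the training split only,
# then map both splits to integer sequences.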
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

# max_features = len(tokenizer.word_index) + 1
# maxlen = max([len(s.split()) for s in df.Text])

X_train = sequence.pad_sequences(sequences_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(sequences_test, maxlen=maxlen, padding='post')


# Y_train = np_utils.to_categorical(y_train, nb_classes)
# Y_test = np_utils.to_categorical(y_test, nb_classes)


print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

batch_size = 32

from keras.callbacks import ModelCheckpoint

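# Architecture: trainable 256-d embeddings -> single 512-unit GRU ->
# dense output over the nb_classes category codes.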
model = Sequential()
model.add(Embedding(max_features, 256, input_length=maxlen))
model.add(GRU(512, return_sequences=False))
# model.add(Dropout(0.5))
# softmax (rather than sigmoid) so the outputs form a probability
# distribution, as sparse_categorical_crossentropy expects
model.add(Dense(nb_classes, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

filepath="weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]


model.fit(X_train, y_train, batch_size=batch_size, epochs=5,
          validation_data=(X_test, y_test), callbacks=callbacks_list)

df_test = pd.read_csv('/content/drive/MyDrive/test.csv.zip')

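# Apply the same cleaning pipeline to the unlabeled test set.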
df_test["Text"] = df_test.Text.map(remove_stopwords)
df_test["Text"] = df_test.Text.map(lambda x: url.sub(r" ",x))
df_test["Text"] = df_test.Text.map(lambda x: html.sub(r" ",x))
df_test["Text"] = df_test.Text.map(lambda x: emoji_pattern.sub(r" ",x))
df_test["Text"] = df_test.Text.map(lambda x: number.sub(r" ",x))
df_test["Text"] = df_test.Text.map(lambda x: pnumber.sub(r" ",x))
df_test["Text"] = df_test.Text.map(lambda x: x.translate(str.maketrans(" ", " ", string.punctuation)))
df_test["Text"] = df_test.Text.map(lambda x: special.sub(r" ",x))
df_test["Text"] = df_test.Text.map(lambda x: single.sub(r" ", x))
df_test["Text"] = df_test.Text.map(lambda x: normalizer.normalize(x))

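# Tokenize the test set with the fitted tokenizer, restore the best
# checkpoint, and take the argmax over the predicted class scores.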
sequences_final = tokenizer.texts_to_sequences(df_test.Text)
X_final = sequence.pad_sequences(sequences_final, maxlen=maxlen, padding='post')
model.load_weights("/content/weights-improvement-02-0.84.hdf5")
Y_final = model.predict(X_final)
Y_classes = Y_final.argmax(axis=-1)

# df = pd.read_csv('/content/drive/MyDrive/train.csv')
# df.Category = df.Category.astype('category')

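# Map predicted integer codes back to category names using the training
# set's categorical index.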
Y_classes_Name = df.Category.cat.categories[Y_classes]

df_test = df_test.rename(columns = {"Text":"Category"})
df_test.Category = Y_classes_Name

df_test.to_csv("Final.csv", index=False)
205 changes: 205 additions & 0 deletions sklearn_final_project.py
@@ -0,0 +1,205 @@
# -*- coding: utf-8 -*-
"""SKlearn-Final-Project.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1_Dk0kPlBe0zjNb3eV-Z06SPpG3b1mYmw
"""

!pip install hazm

import numpy as np
import pandas as pd
import re
import nltk
from sklearn.datasets import load_files
nltk.download('stopwords')
import pickle
from nltk.corpus import stopwords

from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv('/content/drive/MyDrive/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/test.csv.zip')

X, Y = df.Text, df.Category

# lineList = list()
# with open('stop_words.txt') as f:
#     for line in f:
#         lineList.append(line)

# lineList = [x.strip() for x in lineList]

df_test  # notebook cell output: inspect the raw test set

# from __future__ import unicode_literals
# from hazm import *
# import re, string

# def remove_stopwords(text):
#     text = [word for word in text.split() if word not in lineList]
#     return " ".join(text)

# normalizer = Normalizer()


# special = re.compile(r'\W')
# single = re.compile(r'\s+', flags=re.I)

# number = re.compile(r"[-+]?[0-9]+")
# pnumber = re.compile(r"[-+][\u06F0-\u06F90-9]+")
# url = re.compile(r"https?://\S+|www\.\S+")
# html = re.compile(r"<.*?>")
# emoji_pattern = re.compile(
#     "["
#     u"\U0001F600-\U0001F64F"  # emoticons
#     u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#     u"\U0001F680-\U0001F6FF"  # transport & map symbols
#     u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#     u"\U00002702-\U000027B0"  # dingbats
#     u"\U000024C2-\U0001F251"  # enclosed characters
#     "]+",
#     flags=re.UNICODE,
# )


# # df["Text"] = df.Text.map(remove_stopwords)
# # df["Text"] = df.Text.map(lambda x: url.sub(r" ",x))
# # df["Text"] = df.Text.map(lambda x: html.sub(r" ",x))
# # df["Text"] = df.Text.map(lambda x: emoji_pattern.sub(r" ",x))
# # df["Text"] = df.Text.map(lambda x: number.sub(r" ",x))
# # df["Text"] = df.Text.map(lambda x: pnumber.sub(r" ",x))
# df["Text"] = df.Text.map(lambda x: x.translate(str.maketrans(" ", " ", string.punctuation)))
# df["Text"] = df.Text.map(lambda x: special.sub(r" ",x))
# df["Text"] = df.Text.map(lambda x: single.sub(r" ", x))
# # df["Text"] = df.Text.map(lambda x: normalizer.normalize(x))


# # df_test["Text"] = df_test.Text.map(remove_stopwords)
# # df_test["Text"] = df_test.Text.map(lambda x: url.sub(r" ",x))
# # df_test["Text"] = df_test.Text.map(lambda x: html.sub(r" ",x))
# # df_test["Text"] = df_test.Text.map(lambda x: emoji_pattern.sub(r" ",x))
# # df_test["Text"] = df_test.Text.map(lambda x: number.sub(r" ",x))
# # df_test["Text"] = df_test.Text.map(lambda x: pnumber.sub(r" ",x))
# df_test["Text"] = df_test.Text.map(lambda x: x.translate(str.maketrans(" ", " ", string.punctuation)))
# df_test["Text"] = df_test.Text.map(lambda x: special.sub(r" ",x))
# df_test["Text"] = df_test.Text.map(lambda x: single.sub(r" ", x))
# # df_test["Text"] = df_test.Text.map(lambda x: normalizer.normalize(x))


# # for i in range(df_texts_size):
# # bound = len(df_texts_hazm[i].split(' ')) - 1
# # for char in "0123456789۰۱۲۳۴۵۶۷۸۹!#$%&().*+,،-/:;<=>?@[\\]^_`{|}~\t\n":
# # df_texts_hazm[i] = df_texts_hazm[i].replace(char, ' ', bound)
# # # for word in lineList:
# # # df_texts_hazm[i] = df_texts_hazm[i].replace(word, ' ', bound)


# # df_texts_hazm[i] = url.sub(r"",df_texts_hazm[i])
# # df_texts_hazm[i] = html.sub(r"",df_texts_hazm[i])
# # df_texts_hazm[i] = emoji_pattern.sub(r"",df_texts_hazm[i])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Text, df.Category, test_size=0.1, random_state=42)

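# TF-IDF features over the raw text, fit on the training split only.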
from sklearn.feature_extraction.text import TfidfVectorizer
# tfidfconverter = TfidfVectorizer(max_features=1500, min_df=3, max_df=0.8)
tfidfconverter = TfidfVectorizer()

X = tfidfconverter.fit_transform(X_train)

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.svm import SVC, NuSVC
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import PassiveAggressiveClassifier


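# Model-selection experiments kept for reference; the two best performers
# are retained below.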
# parameters = {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

# classifier = GaussianNB()
# classifier = GridSearchCV(classifier, parameters, n_jobs=-1)
# classifier = RandomForestClassifier(n_estimators=200, random_state=0)
# classifier = RandomForestClassifier(n_estimators=100, criterion='gini',max_depth=10, random_state=0, max_features=None)
# classifier_sgc = SGDClassifier(loss="modified_huber", penalty="l2", max_iter=5000)

# classifier_linsvc = SVC(kernel='linear',probability=True)
# classifier_logreg = LogisticRegression(max_iter=1000)
# classifier = KNeighborsClassifier(n_neighbors = 3)
# classifier = SVC()
# classifier = OneVsRestClassifier(SVC(), n_jobs=-1)
# classifier = XGBClassifier()
# classifier = LinearSVC(loss="hinge", max_iter=10000, C=3)


# classifier = NuSVC(probability=True)
# classifier = LinearDiscriminantAnalysis()

# classifier_sgc.fit(X, y_train)
# classifier_linsvc.fit(X, y_train)
# classifier_logreg.fit(X, y_train)

# estimators=[('sgc', classifier_sgc), ('svc', classifier_linsvc)]

# classifier = VotingClassifier(estimators, voting='hard', n_jobs=-1)

# classifier = classifier_sgc

# classifier = NBSVM(C=0.001)

# classifier.fit(X, y_train)

# classifier = PassiveAggressiveClassifier()

# pipe = Pipeline(steps = [("tfidf_vectorization", TfidfVectorizer()), ("classifier", MultinomialNB())])

# search_space = [{"classifier": [MultinomialNB()]},
# {"classifier": [LinearSVC()]},
# {"classifier": [PassiveAggressiveClassifier()]},
# {"classifier": [LogisticRegression()],
# "classifier__solver": ["liblinear"]},
# {"classifier": [KNeighborsClassifier()],
# "classifier__n_neighbors": [6,7,8]}]

# classifier = GridSearchCV(estimator=pipe, param_grid=search_space, cv=10, return_train_score=True, n_jobs=-1, refit="AUC")


# Two best models:
classifier = SGDClassifier(loss="modified_huber", max_iter=100000, alpha=0.00001)
# classifier = LinearSVC()

classifier.fit(X, y_train)

X = tfidfconverter.transform(X_test)
y_pred = classifier.predict(X)

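# Evaluate on the held-out 10% split.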
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

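# Vectorize the unlabeled test set with the fitted converter and write
# the submission file.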
X = tfidfconverter.transform(df_test.Text)
Y_final = classifier.predict(X)
df_test_res = df_test.copy()
df_test_res.Text = Y_final
df_test_res = df_test_res.rename(columns = {"Text":"Category"})
df_test_res.to_csv("Final.csv", index=False)

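# Persist the trained classifier with pickle for later reuse.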
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier, picklefile)

!ls