-
Notifications
You must be signed in to change notification settings - Fork 39
/
trump_predictor.py
97 lines (84 loc) · 2.19 KB
/
trump_predictor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import csv
import pickle
import collections
import numpy as np
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.models import load_model
# Locations of the trained per-dimension models/tokenizers and the input data.
MODELS_DIR = "models"
DATA_DIR = "data"
TRUMP_TWEETS_PATH = os.path.join(DATA_DIR, "trumptweets.csv")
# The four MBTI axes; one binary model predicts each letter pair.
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
MODEL_BATCH_SIZE = 128  # NOTE(review): defined but never used in this script
TOP_WORDS = 2500  # NOTE(review): unused here — presumably the training-time vocab cap
MAX_POST_LENGTH = 40  # pad/truncate length for tokenized posts (used below)
EMBEDDING_VECTOR_LENGTH = 20  # NOTE(review): unused here — training-time embedding size
final = ""  # accumulates the predicted four-letter MBTI type, one letter per dimension
x_test = []  # tweet texts read from the CSV
# Load the tweets to classify.
# BUG FIX: the original created csv.reader(f) but then iterated over the raw
# file handle, so `reader` was dead and every appended "row" was an unparsed
# CSV line (quoting artifacts, trailing newline included). Iterate the reader
# and join its fields into one text string per tweet instead.
# NOTE(review): CSV schema unknown from here — if the tweet text lives in one
# known column, prefer x_test.append(row[col]) over joining all fields.
with open(TRUMP_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
    reader = csv.reader(f)
    for row in reader:
        x_test.append(" ".join(row))
# The sixteen MBTI type codes, lowercased so they can be stripped out of the
# post text before classification (prevents the model keying on explicit
# type mentions).
types = [
    code.lower()
    for code in (
        "INFJ", "ENTP", "INTP", "INTJ",
        "ENTJ", "ENFJ", "INFP", "ENFP",
        "ISFP", "ISTP", "ISFJ", "ISTJ",
        "ESTP", "ESFP", "ESTJ", "ESFJ",
    )
]
# Shared NLP resources: a WordNet lemmatizer and the English stopword list.
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")
def lemmatize(x):
    """Normalize each post for the model.

    Lowercases the text, strips any space-preceded MBTI type mention,
    drops English stopwords, and lemmatizes the remaining words.
    Returns a numpy array of cleaned post strings, one per input post.
    """
    cleaned = []
    for post in x:
        text_ = post.lower()
        # Remove " infj", " entp", ... so explicit type mentions don't leak.
        for code in types:
            text_ = text_.replace(" " + code, "")
        kept = [
            lemmatizer.lemmatize(word)
            for word in text_.split(" ")
            if word not in stop_words
        ]
        cleaned.append(" ".join(kept))
    return np.array(cleaned)
def _preprocess(posts, tokenizer):
    """Lemmatize posts, tokenize with the given (dimension-specific)
    tokenizer, and pad every sequence to MAX_POST_LENGTH."""
    lemmatized = lemmatize(posts)
    tokenized = tokenizer.texts_to_sequences(lemmatized)
    return sequence.pad_sequences(tokenized, maxlen=MAX_POST_LENGTH)


# Predict each MBTI dimension independently with its own trained RNN model
# and tokenizer, then assemble the four predicted letters into `final`.
# FIX: `preprocess` was re-defined inside the loop on every iteration and
# closed over the loop-local tokenizer; it is now a single top-level helper
# taking the tokenizer explicitly. The dead `tokenizer = None` assignment
# was dropped, and the mean is computed with np.mean for clarity.
for k in range(len(DIMENSIONS)):
    model = load_model(
        os.path.join(MODELS_DIR, "rnn_model_{}.h5".format(DIMENSIONS[k]))
    )
    # SECURITY NOTE: pickle.load executes arbitrary code on load — only
    # unpickle tokenizers from a trusted source.
    with open(
        os.path.join(MODELS_DIR, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])), "rb"
    ) as f:
        tokenizer = pickle.load(f)
    predictions = model.predict(_preprocess(x_test, tokenizer))
    # Average sigmoid output over all tweets: >= 0.5 selects the second
    # letter of the pair (e.g. the "E" of "IE"), otherwise the first.
    prediction = float(np.mean(predictions))
    print(DIMENSIONS[k])
    print(prediction)
    if prediction >= 0.5:
        final += DIMENSIONS[k][1]
    else:
        final += DIMENSIONS[k][0]
print("")
print("Final prediction: {}".format(final))