# main.py
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn import metrics
from scipy import stats
import pandas as pd
import warnings
import time
# Read data from IMDb.csv file
data = pd.read_csv('mylib/IMDb.csv')
print(data)
# drop the unused id column (note: drop() returns a new frame, so assign it back)
data = data.drop('id', axis=1)
# list all the useful columns from the IMDb file
useful_columns = [
    'castTotalLikes',
    'directorLikes',
    'actor1likes',
    'movieLikes',
    'fbPosters',
    'year',
    'duration',
    'genre',
    'contentRating',
    'criticReviews',
    'userReviews',
    'userVotes',
    'rating',
]
# save the useful data
data = data[useful_columns]
# remove null values from data
data = data.dropna()
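# quick sanity check (a sketch, not part of the original script): report how
# many rows survive the null filter
print(f"Rows remaining after dropna: {len(data)}")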
# split the rows by release year: movies after 1990 vs. 1990 and earlier (not used below)
movies_after_1990 = data[data.year > 1990]
movies_up_to_1990 = data[data.year <= 1990]
# write the cleaned data to IMDb_likes_review.csv, then load IMDB_new.csv
data.to_csv('mylib/IMDb_likes_review.csv', sep=',')
data = pd.read_csv('mylib/IMDB_new.csv', sep=',')
# remove id
data = data.drop('id', axis=1)
# plot the distribution of each column as a histogram
pd.DataFrame.hist(data, figsize=[15, 15])
# plt.show()
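# since plt.show() is left commented out, a sketch for persisting the plots to
# disk instead; the filename is an assumption, not part of the original script
plt.savefig('histograms.png')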
################### LINEAR REGRESSION MODEL ###################
# features (x): drop the target plus the likes and genre columns excluded from this model
x = data.drop(['rating', 'movieLikes', 'directorLikes', 'genre', 'castTotalLikes', 'actor1likes'], axis=1)
# target (y): the rating column
y = data.rating
# split into training and test sets, holding out 40% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.4)
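# sanity-check sketch: confirm the 60/40 split sizes
print(f"Train/test sizes: {X_train.shape[0]} / {X_test.shape[0]}")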
start = time.time()
# fit an Ordinary Least Squares (OLS) regression on the training data
# (note: sm.OLS does not add an intercept, so the model is fit without a constant)
linear_model_regression = sm.OLS(y_train, X_train)
result = linear_model_regression.fit()
end = time.time()
# save the summary of the fitted model to a text file
with open('OLS_result.txt', 'w') as f:
    f.write(result.summary().as_text())
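# sketch: headline statistics are also available directly on the fitted results
# object, without parsing the text summary (with no constant term, statsmodels
# reports the uncentered R-squared)
print(f"OLS R-squared: {result.rsquared:.3f}")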
# compute test-set predictions and measure the linear regression's accuracy
X_accuracy = pd.DataFrame(result.predict(X_test))
X_accuracy_round = X_accuracy.round(1)
Y_accuracy = pd.DataFrame(y_test)
Y_accuracy = Y_accuracy.reset_index(drop=True)
X_accuracy_round['rating'] = Y_accuracy
X_accuracy_round.columns = ['pred', 'actual']
X_accuracy_round['difference'] = round(abs(X_accuracy_round.pred - X_accuracy_round.actual), 2)
print(f"Total Predictions: {X_accuracy_round.difference.count()}")
print(f"Accuracy of Linear Regression:")
print((X_accuracy_round.difference < 1.1).sum()/X_accuracy_round.difference.count())
print(f"Time execution: {end - start} sec")
################### K-NEAREST NEIGHBOUR MODEL ###################
# work on a copy so the new grades column does not mutate the original frame
knn_data = data.copy()
# bin the 0-10 ratings into four grade classes for classification
rate = [0.0, 2.5, 5.0, 7.5, 10.0]
grade = ['VeryBad', 'Bad', 'Good', 'VeryGood']
knn_data['grades'] = pd.cut(knn_data.rating, rate, labels=grade)
print(knn_data)
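# sketch: check how balanced the grade classes are before fitting KNN, since
# heavy class imbalance would make plain accuracy misleading
print(knn_data.grades.value_counts())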
x = knn_data.drop(['rating', 'grades'], axis=1)
y = knn_data.grades
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.4)
# sweep k from 10 to 69 and record the test accuracy for each value
k_range = range(10, 70)
score = []
# suppress sklearn warnings once, rather than on every iteration
warnings.filterwarnings("ignore")
start = time.time()
for i in k_range:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    prediction = knn.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, prediction)
    score.append(accuracy)
end = time.time()
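# sketch: report the single best k from the sweep, alongside the average below
best_k = k_range[score.index(max(score))]
print(f"Best k: {best_k} (accuracy: {max(score):.3f})")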
# average accuracy across all tested values of k
final_score = sum(score) / len(score)
print(f"accuracy: {final_score} | time execution:{end - start} sec")
plt.plot(k_range, score)
plt.xlabel('Value of K')
plt.ylabel('Accuracy')
# plt.show()