-
Notifications
You must be signed in to change notification settings - Fork 0
/
ANALISE_SUPERA.R
227 lines (163 loc) · 7.82 KB
/
ANALISE_SUPERA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#------------ Loading the dataset for analysis -----------------------------------------
# The input workbook is expected at "C:/bechdel.xlsx".
# install.packages("readxl") # run once if the package is not installed yet
library(readxl)#acessing the library for readering of the archive .xlsx
# Part 1: Data setup
# Read the workbook, drop its first data row, keep the first ten columns and
# convert the result to a base data.frame.
bechdel <- data.frame(read_excel("C:/bechdel.xlsx")[-1, 1:10])
bechdel <- bechdel[order(bechdel$Year), ] # sort the rows by the Year column
head(bechdel)
# Numeric release year, followed by its frequency table and the film count.
year <- as.numeric(bechdel$Year)
table(year)
length(year)
order(year)
sort(year)
imdb <- as.character(bechdel$IMDB.code) # IMDB.code column as character
# Construct the IMDB URLs. paste0() is vectorised, so no index loop is needed;
# this avoids growing a vector element by element and gives an empty result
# (instead of a spurious ".../NA" entry) when the table has no rows.
IMDB <- paste0("http://www.imdb.com/title/", imdb)
# Fixed label kept under the name "title".
title <- "Movie test"

# The Code column, kept as text.
period_code <- as.character(bechdel$Code)

# Monetary columns as recorded; the two gross columns contain NAs.
budget <- as.numeric(bechdel$Budget)
domgross <- as.numeric(bechdel$Dom.gross)
intgross <- as.numeric(bechdel$Int.Gross)

# The same three quantities from the "2013 dollars" columns; the gross
# columns again contain NAs.
budget_2013 <- as.numeric(bechdel$budget.2013.dollars)
domgross_2013 <- as.numeric(bechdel$Dom.gross.2013.dollars)
intgross_2013 <- as.numeric(bechdel$Int.gross.2013.dollars)
# Encode the simple pass/fail verdict as a number: 1 for "PASS", 0 for any
# other value. The comparison is vectorised, so a missing verdict becomes NA
# instead of aborting the script inside a scalar if(), and an empty table
# yields an empty vector rather than failing in a 1:length() loop.
binary <- as.numeric(bechdel$Simple.pass...fail == "PASS")
table(binary) # show how many films fall in each class
#Part 2: Data analysis
# i) Summarise the data
# Collect the numeric variables under their own names and summarise each one;
# lapply() keeps the names, so the results can be read as summarise$<name>.
lista <- list(year = year, budget = budget, domgross = domgross,
              intgross = intgross, budget_2013 = budget_2013,
              domgross_2013 = domgross_2013, intgross_2013 = intgross_2013,
              binary = binary)
summarise <- lapply(lista, summary)
summarise$year
par(mfrow = c(2, 4)) # 2 x 4 plotting grid for the boxplots below
# Each boxplot is drawn from the raw values. The summary() output holds only
# the summary statistics (plus an "NA's" count when values are missing), so
# plotting it would not show the distribution of the data.
# sd() is called with na.rm = TRUE because the gross columns contain NAs and
# would otherwise produce NA.
summarise$budget
boxplot(budget, col = "grey", main = "Budget")
sd(budget, na.rm = TRUE) # standard deviation of budget
summarise$domgross
boxplot(domgross, col = "blue", main = "Domgross")
sd(domgross, na.rm = TRUE) # standard deviation of domgross
summarise$intgross
boxplot(intgross, col = "green", main = "Intgross")
sd(intgross, na.rm = TRUE) # standard deviation of intgross
summarise$budget_2013
boxplot(budget_2013, col = "red", main = "Budget 2013")
sd(budget_2013, na.rm = TRUE) # standard deviation of budget_2013
summarise$domgross_2013
boxplot(domgross_2013, col = "purple", main = "Domgross 2013")
sd(domgross_2013, na.rm = TRUE) # standard deviation of domgross_2013
summarise$intgross_2013
boxplot(intgross_2013, col = "pink", main = "Intgross 2013")
sd(intgross_2013, na.rm = TRUE) # standard deviation of intgross_2013
# Gather the analysis variables into a single data frame.
dataframe <- data.frame(year, budget, domgross, intgross, budget_2013,
                        domgross_2013, intgross_2013, binary)
par(mfrow = c(1, 1)) # back to a single plot per page

# Share of films whose flag equals 1 among those released in a year window.
#   years        numeric vector of release years
#   flags        0/1 vector aligned with `years`
#   from, to     window bounds; `from` is always included
#   closed_right when TRUE `to` is included as well, otherwise it is excluded
# Films with a missing year are left out of every window, so a single NA no
# longer turns the whole rate into NA. An empty window gives NaN.
pass_share <- function(years, flags, from, to, closed_right = FALSE) {
  below_upper <- if (closed_right) years <= to else years < to
  in_window <- !is.na(years) & years >= from & below_upper
  mean(flags[in_window] == 1, na.rm = TRUE)
}

# Pass and fail shares per five-year window; only the last window includes
# its upper bound.
P_PASS_1970_1975 <- pass_share(dataframe$year, dataframe$binary, 1970, 1975)
P_FAIL_1970_1975 <- 1 - P_PASS_1970_1975
P_PASS_1975_1980 <- pass_share(dataframe$year, dataframe$binary, 1975, 1980)
P_FAIL_1975_1980 <- 1 - P_PASS_1975_1980
P_PASS_1980_1985 <- pass_share(dataframe$year, dataframe$binary, 1980, 1985)
P_FAIL_1980_1985 <- 1 - P_PASS_1980_1985
P_PASS_1985_1990 <- pass_share(dataframe$year, dataframe$binary, 1985, 1990)
P_FAIL_1985_1990 <- 1 - P_PASS_1985_1990
P_PASS_1990_1995 <- pass_share(dataframe$year, dataframe$binary, 1990, 1995)
P_FAIL_1990_1995 <- 1 - P_PASS_1990_1995
P_PASS_1995_2000 <- pass_share(dataframe$year, dataframe$binary, 1995, 2000)
P_FAIL_1995_2000 <- 1 - P_PASS_1995_2000
P_PASS_2000_2005 <- pass_share(dataframe$year, dataframe$binary, 2000, 2005)
P_FAIL_2000_2005 <- 1 - P_PASS_2000_2005
P_PASS_2005_2010 <- pass_share(dataframe$year, dataframe$binary, 2005, 2010)
P_FAIL_2005_2010 <- 1 - P_PASS_2005_2010
P_PASS_2010_2015 <- pass_share(dataframe$year, dataframe$binary, 2010, 2015,
                               closed_right = TRUE)
P_FAIL_2010_2015 <- 1 - P_PASS_2010_2015
# Pass shares of the nine periods, rounded to two decimals, oldest first.
PASS <- round(c(P_PASS_1970_1975,
                P_PASS_1975_1980,
                P_PASS_1980_1985,
                P_PASS_1985_1990,
                P_PASS_1990_1995,
                P_PASS_1995_2000,
                P_PASS_2000_2005,
                P_PASS_2005_2010,
                P_PASS_2010_2015), 2)
# Fail shares of the same periods, in the same order.
FAIL <- round(c(P_FAIL_1970_1975,
                P_FAIL_1975_1980,
                P_FAIL_1980_1985,
                P_FAIL_1985_1990,
                P_FAIL_1990_1995,
                P_FAIL_1995_2000,
                P_FAIL_2000_2005,
                P_FAIL_2005_2010,
                P_FAIL_2010_2015), 2)
# One label per period. "1970_1975" was missing, which left eight labels for
# nine values: paste() recycles the shorter vector, so every value was shown
# next to the label of the following period.
PASS_names <- c("1970_1975",
                "1975_1980",
                "1980_1985",
                "1985_1990",
                "1990_1995",
                "1995_2000",
                "2000_2005",
                "2005_2010",
                "2010_2015")
# Guard against the labels and values drifting apart again.
stopifnot(length(PASS_names) == length(PASS),
          length(PASS_names) == length(FAIL))
# What percentage of movies passed the test, based on the binary definition?
P_PASS <- sum(binary) / length(binary)
# Overall share of movies that passed the test (a proportion between 0 and 1):
P_PASS
# How does the percentage of movies passing change over time?
# It does change: the legends on the histogram list the pass and fail
# percentages of each period.
hist(year, main = "Frequency of Movie Produced by Year")
# PASS and FAIL hold proportions rounded to two decimals; they are converted
# to whole percentages here so that the number matches the "%" sign printed
# after it.
legend("topleft", legend = paste(PASS_names, " = ", round(PASS * 100), "%"),
       cex = .575, title = "Percentage of PASS in \n Each Interval")
legend("top", legend = paste(PASS_names, " = ", round(FAIL * 100), "%"),
       cex = .575, title = "Percentage of FAIL in \n Each Interval")
# Calendar years covered by the yearly series.
year_movie <- seq(1970, 2013, 1)
year_movie
table(dataframe$year)
# Row positions of the films released in each year. which() discards NA
# comparisons, so a film with a missing year cannot turn the counts into NA.
rows_by_year <- lapply(year_movie, function(y) which(dataframe$year == y))
# Films that passed per year (binary is 0/1, so the sum is a count), films
# that failed per year, and the share that passed. A year without films gives
# 0 / 0 = NaN, which shows up as a gap in the line plot.
Num_Pass <- vapply(rows_by_year,
                   function(rows) sum(dataframe$binary[rows]),
                   numeric(1))
Num_Fail <- lengths(rows_by_year) - Num_Pass
percentage <- Num_Pass / (Num_Pass + Num_Fail)
plot(year_movie, percentage, type = "l", lty = 2, col = "blue",
     main = "Percentage of Movie What PASS In Each Year",
     xlab = "Year of Movie", ylab = "Percentage in Each Year")
plot(year_movie, Num_Pass, type = "l", lty = 2, col = "blue",
     main = "Number of Movie what PASS In Each Year",
     xlab = "Year of Movie", ylab = "Quanties of Movie")
# Each year's passing films as a fraction of all passing films.
plot(year_movie, Num_Pass / (sum(Num_Pass)), type = "l", lty = 2, col = "blue",
     main = " Time Series of Proportion of Movie what PASS",
     xlab = "Year of Movie", ylab = "Proportion")
# ii) Propose a model that will predict
#install.packages("asbio")
library(asbio)
# Fit on complete cases only. lm() would drop incomplete rows by itself, but
# step() stops with an error when the number of rows in use changes between
# candidate models, which can happen once a predictor containing NAs is
# dropped. Fixing the rows up front keeps the full model unchanged.
train <- na.omit(data.frame(binary, budget, domgross, intgross))
lms <- lm(binary ~ budget + domgross + intgross, data = train)
model_predict <- step(lms, direction = "backward") # predictive model
# Check the location of the file before running.
teste_csv <- read.csv("C:/test.csv", header = TRUE, dec = ".", sep = ",")
head(teste_csv)
# predict() builds the design matrix from the terms that survived step().
# Multiplying the remaining coefficients by a fixed four-column matrix is
# non-conformable as soon as step() removes a predictor.
predicti <- predict(model_predict, newdata = teste_csv)
# Classify as 1 when the fitted value reaches 0.5, otherwise 0; a film with a
# missing predictor gets NA.
classify <- as.numeric(predicti >= 0.5)
mdb <- cbind.data.frame(imdb = teste_csv$imdb, classify)
head(mdb)
# NOTE(review): this path is relative (no leading "/" or drive letter), unlike
# the "C:/" inputs above - confirm the intended output location.
write.csv(mdb, file = "Users/c4r10s/Documents/mdb.csv")