-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathS1_read_data.R
102 lines (77 loc) · 3.34 KB
/
S1_read_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# This model is linear model for HMSC ROIZ DATA without the culex theilerri data that affect model fittings.
getwd()
# READING IN SXY: study design (S) and/or covariates (X) and species data (Y)
SXY = read.csv("Msq_donana_data.csv", stringsAsFactors=TRUE, header = TRUE) # or use read.csv2 when values are semicolon-separated
# S: study design, including units of study and their possible coordinates (named as Route_x and Route_y to indicate that they relate to the units of Route)
# X: covariates to be used as predictors
# Y: species data
# If you don't have variables that define the study design, indicate this by S=NULL
# If you don't have covariate data, indicate this by X=NULL
View(SXY)
str(SXY)
# colnames(SXY) <- c("fid", "field_1","trap", "trapnb", "Unit", "Unit_Class", "Unit_code", "Oc.detritu", "Oc.caspius",
# "Cx.theiler", "Cx.pipiens", "Cx.modestu", "Cx.perexig", "An.atropar", "IShannon", "Total_mosq", "Riqueza_me",
# "HIDRO_MEAN_250", "HIDRO_MEAN_500", "HIDRO_MEAN_1000", "HIDRO_MEAN_2000", "LULC_100", "LULC_250", "LULC_500",
# "LULC_1000", "LULC_2000", "NDVI_100", "NDVI_250", "NDVI_500", "NDVI_1000", "NDVI_2000", "DIST_RIO",
# "DIST_ARROZ", "DIST_urban", "Norte","Oeste", "LST_1")
#write.csv(SXY, "Msq_donana_data.csv", row.names = FALSE)
#variable selection.
#checking collinearity in the predictor
tPred<- SXY[, 18:37]
View(tPred)
tPred <-tPred[, -c(5:9, 18,19)]
View(tPred)
# install.packages('usdm')
library(usdm) # for performing variance inflation factor etc
tPred_vif <- vif(tPred)
tPred_vif
tPred_vifCor <- vifcor(tPred, th=0.7)
tPred_vifCor
tPred_vifstep <- vifstep(tPred, th=5)
tPred_vifstep
df_sp <- data.frame(tPred, SXY$Unit_code)
View(df_sp)
# rename column name from spanish to english
names(SXY)[35] <- paste0("North")
names(SXY)[36] <- paste0("East")
# studyDesign=SXY[, c("Unit_Class", "trap")]
# studyDesign= data.frame(apply(studyDesign, 2, as.factor))
# class= HmscRandomLevel(units = studyDesign$Unit_Class)
# trap= HmscRandomLevel(units= studyDesign$trap)
S=SXY[, c(3, 35:36)]
X=SXY[,c(6, 18, 31, 32, 33, 34, 37)] # Predictors
Y=SXY[,c(8:14)] # response variable
View(Y)
Y2 <- Y
i <- c(1:7)
# convert the decimal number into nearest interger
Y2[,i ] <- apply(Y2[, i], 2, function(x)as.numeric(ceiling(x)))
View(Y2)
# Check that the data looks as it should!
View(S)
View(X)
View(Y)
# check that community data are numeric and have finite numbers. If the script
# writes "Y looks OK", you are ok.
if (is.numeric(as.matrix(Y)) || is.logical(as.matrix(Y)) && is.finite(sum(Y, na.rm=TRUE))) {
print("Y looks OK")
} else {
print("Y should be numeric and have finite values") }
if (is.numeric(as.matrix(Y2)) || is.logical(as.matrix(Y2)) && is.finite(sum(Y2, na.rm=TRUE))) {
print("Y2 looks OK")
} else {
print("Y2 should be numeric and have finite values") }
# Check that the stydy design data do not have missing values (they are allowed for Y but not S, X, P or Tr)
if (any(is.na(S))) {
print("S has NA values - not allowed for")
} else {
print("S looks ok") }
# Check that the covariate data do not have missing values (they are allowed for Y but not S, X, P or Tr)
if (any(is.na(X))) {
print("X has NA values - not allowed for")
} else {
print("X looks ok") }
getwd()
Y=as.matrix(Y)
Y2=as.matrix(Y2)
save(S,X,Y,Y2, file = 'data/allData.R')