-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_analysis.R
145 lines (112 loc) · 4.99 KB
/
run_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
## This script gets and cleans the UCI HAR Dataset for data science applications.
# The tidy data includes all variables from the original dataset, because
# these variables may be used in many applications (e.g. random forests, neural networks, ...).
#
# Project: Getting and Cleaning Data, creating a tidy dataset
# Author: Leard Fernandes
#' Build the tidy UCI HAR summary data set.
#'
#' Downloads and unpacks the UCI HAR archive when it is not already present
#' under `data_dir`, stacks the train and test splits, labels the activities,
#' and computes the mean and standard deviation of every feature for each
#' (subject, activity) pair. The result is written to
#' `<data_dir>/UCI_HAR_tydy_Dataset.txt` and also returned.
#'
#' @param data_dir Directory that holds (or will hold) the zip archive, the
#'   extracted "UCI HAR Dataset" folder and the output file. Defaults to
#'   "data", relative to the current working directory.
#' @return A data.frame with the columns `subject` and `activity` (factors),
#'   followed by one "<feature>-mean" column per feature and then one
#'   "<feature>-sd" column per feature, ordered by subject and activity.
#'   Groups with a single observation have `NA` standard deviations.
tidy_data <- function(data_dir = "data") {
  file_url <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
  zip_path <- file.path(data_dir, "UCI_HAR_Dataset.zip")
  dataset_dir <- file.path(data_dir, "UCI HAR Dataset")
  key_cols <- c("subject", "activity")

  ## Download and unpack the data set ----
  if (!dir.exists(data_dir)) {
    dir.create(data_dir, recursive = TRUE)
  }
  # Decide on the extracted folder, not on the zip: a zip that was downloaded
  # but never unpacked must still be extracted.
  if (!dir.exists(dataset_dir)) {
    if (!file.exists(zip_path)) {
      # "wb" keeps the binary archive intact on Windows.
      download.file(file_url, destfile = zip_path, mode = "wb")
    }
    # Extract next to the zip so the paths read below actually exist
    # (the default exdir is the working directory).
    unzip(zip_path, exdir = data_dir)
  }

  ## Feature (column) names ----
  features <- read.table(
    file.path(dataset_dir, "features.txt"),
    col.names = c("id", "name"),
    stringsAsFactors = FALSE
  )
  feature_names <- tolower(features$name)
  feature_names <- sub("()", "", feature_names, fixed = TRUE)  # drop first "()"
  feature_names <- gsub("(", "-", feature_names, fixed = TRUE) # "(" -> "-"
  feature_names <- gsub(")", "", feature_names, fixed = TRUE)  # drop ")"
  feature_names <- gsub(",", "-", feature_names, fixed = TRUE) # "," -> "-"

  # Three groups of 42 names (starting at 303, 382 and 461) each contain the
  # same 14 names repeated 3 times; append "-b1".."-b3" to make them unique.
  band_group_starts <- c(303L, 382L, 461L)
  band_group_len <- 42L
  band_repeats <- 3L
  stopifnot(
    length(feature_names) >= max(band_group_starts) + band_group_len - 1L
  )
  band_suffix <- rep(
    paste0("-b", seq_len(band_repeats)),
    each = band_group_len %/% band_repeats
  )
  for (start in band_group_starts) {
    idx <- seq(start, length.out = band_group_len)
    feature_names[idx] <- paste0(feature_names[idx], band_suffix)
  }

  ## Activity labels ----
  activities <- read.table(
    file.path(dataset_dir, "activity_labels.txt"),
    col.names = c("id", "activity"),
    stringsAsFactors = FALSE
  )

  ## Read and stack the train and test splits ----
  # Reads subject, activity code and measurements of one split ("train" or
  # "test") and binds them column-wise as: subject, activity, features...
  read_split <- function(split) {
    read_part <- function(prefix) {
      read.table(
        file.path(dataset_dir, split, paste0(prefix, "_", split, ".txt"))
      )
    }
    subject <- read_part("subject")
    activity <- read_part("y")
    measurements <- read_part("X")
    names(subject) <- "subject"
    names(activity) <- "activity"
    names(measurements) <- feature_names
    cbind(subject, activity, measurements)
  }
  # The splits hold different observations of the same variables, so they are
  # stacked row-wise; a merge() keyed on every column is not needed.
  merged <- rbind(read_split("train"), read_split("test"))

  ## Descriptive variables ----
  merged$subject <- factor(merged$subject)
  # Map each activity code to its label explicitly; binning the codes with
  # cut() mislabels them whenever code 1 or code 6 is absent from the data.
  merged$activity <- factor(
    merged$activity,
    levels = activities$id,
    labels = as.character(activities$activity)
  )

  ## Mean and standard deviation per subject and activity ----
  means <- aggregate(merged[-(1:2)], by = merged[1:2], FUN = mean)
  sds <- aggregate(merged[-(1:2)], by = merged[1:2], FUN = sd)
  names(means)[-(1:2)] <- paste0(feature_names, "-mean")
  names(sds)[-(1:2)] <- paste0(feature_names, "-sd")

  tidy <- merge(means, sds, by = key_cols, all = TRUE)
  tidy <- tidy[order(tidy$subject, tidy$activity), ]

  ## Save and return ----
  # NOTE: the "tydy" spelling is kept on purpose so the output path stays the
  # same for anything that already reads this file.
  write.table(
    tidy,
    file.path(data_dir, "UCI_HAR_tydy_Dataset.txt"),
    row.names = FALSE
  )
  tidy
}