getting-cleaning-data/run_analysis.R at master · Yegor-Budnikov/getting-cleaning-data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
library(dplyr)


#----------- Get some names -----------#

# Subject lists of the test and the training split; the first column of each
# table is later attached to the matching measurement table.
subject_test <- read.table(file = "./UCI HAR Dataset/test/subject_test.txt")
subject_train <- read.table(file = "./UCI HAR Dataset/train/subject_train.txt")

# Activity lookup table, with its two columns labelled "activity_id" and
# "activity_name" in a single step.
activity_names <- setNames(
  read.table(file = "./UCI HAR Dataset/activity_labels.txt"),
  c("activity_id", "activity_name")
)

# Table listing the features.
feature_names <- read.table(file = "./UCI HAR Dataset/features.txt")


#----------- Get the training set -----------#

# Read the measurements of the training set.
X_train <- read.table("./UCI HAR Dataset/train/X_train.txt")

# Use the second column of the features table as column names. The table is
# stored in `feature_names`; the former spelling `features_names` referred to
# an object that does not exist, so the script stopped here.
names(X_train) <- as.character(feature_names[, 2])

# Read the activity labels of the training set and name their single column.
y_train <- read.table("./UCI HAR Dataset/train/y_train.txt")
names(y_train) <- "label_id"

# Attach the activity label and the subject to every row *before* merging:
# merge() sorts its result on the key, so row order is not preserved.
X_train$activity_id <- y_train$label_id
X_train$subject_id <- subject_train[, 1]

# Add the descriptive activity name that belongs to each activity_id.
X_train <- merge(X_train, activity_names, by = "activity_id", all = TRUE)


#----------- Get test set -----------#

# Read the measurements of the test set.
X_test <- read.table("./UCI HAR Dataset/test/X_test.txt")

# Use the second column of the features table as column names. The table is
# stored in `feature_names`; the former spelling `features_names` referred to
# an object that does not exist, so the script stopped here.
names(X_test) <- as.character(feature_names[, 2])

# Read the activity labels of the test set and name their single column.
y_test <- read.table("./UCI HAR Dataset/test/y_test.txt")
names(y_test) <- "label_id"

# Attach the activity label and the subject to every row *before* merging:
# merge() sorts its result on the key, so row order is not preserved.
X_test$activity_id <- y_test$label_id
X_test$subject_id <- subject_test[, 1]

# Add the descriptive activity name that belongs to each activity_id.
X_test <- merge(X_test, activity_names, by = "activity_id", all = TRUE)


#----------- Merge sets -----------#

# Stack the rows of the training set on top of the rows of the test set to
# form one combined data set.
data_set <- rbind(X_train, X_test)


#----------- Look for the measurements on mean -----------#

# Get the indices of the measurements on the mean. The pattern requires the
# literal text "mean()", so a name like "meanFreq()" would not match.
mean_idx <- grep("mean\\(\\)", names(data_set))

# Get the current names of these columns. Indexing the name vector (rather
# than subsetting the data frame first) avoids copying the data and still
# returns a name when only a single column matches.
old_names_mean <- names(data_set)[mean_idx]

# Build the new names: "-mean()" becomes "Mean", then every remaining "-"
# is removed (e.g. "-X" becomes "X").
new_names_mean <- gsub("-mean\\(\\)", "Mean", old_names_mean)
new_names_mean <- gsub("-", "", new_names_mean)


#----------- Look for the measurements on standard deviation -----------#

# Get the indices of the measurements on the standard deviation; the pattern
# requires the literal text "std()".
std_idx <- grep("std\\(\\)", names(data_set))

# Get the current names of these columns. Indexing the name vector (rather
# than subsetting the data frame first) avoids copying the data and still
# returns a name when only a single column matches.
old_names_std <- names(data_set)[std_idx]

# Build the new names: "-std()" becomes "Std", then every remaining "-"
# is removed (e.g. "-X" becomes "X").
new_names_std <- gsub("-std\\(\\)", "Std", old_names_std)
new_names_std <- gsub("-", "", new_names_std)


#----------- Cleaning the data -----------#

# Locate the columns holding the subject id and the activity name.
subject_idx <- match("subject_id", colnames(data_set))
activity_idx <- match("activity_name", colnames(data_set))

# Give the mean and std columns their new names (same order as the indices).
colnames(data_set)[c(mean_idx, std_idx)] <- c(new_names_mean, new_names_std)

# Keep only the subject id, the mean() and std() measurements and the
# activity name, in that order.
data_set <- data_set[, c(subject_idx, mean_idx, std_idx, activity_idx)]


#----------- Getting the tidy data -----------#

# Columns that identify a group; every other column of data_set is treated
# as a measurement. Selecting by name avoids relying on column positions.
id_cols <- c("subject_id", "activity_name")
measure_cols <- setdiff(names(data_set), id_cols)

# Average every measurement for each subject/activity combination.
tidy_data <- aggregate(
  data_set[measure_cols],
  by = data_set[id_cols],
  FUN = mean
)

# Prefix the averaged columns with "Mean of". The names are taken from
# tidy_data itself; the former code read them from `cdata`, which is not
# defined in this script, so this step failed with "object not found".
is_measure <- names(tidy_data) %in% measure_cols
names(tidy_data)[is_measure] <- paste("Mean of", names(tidy_data)[is_measure])