forked from rdpeng/RepData_PeerAssessment1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPA1_template.Rmd
executable file
·141 lines (112 loc) · 6.55 KB
/
PA1_template.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
---
title: 'Reproducible Research: Peer Assessment 1'
author: "Victor Bernal"
date: "`r format(Sys.Date(), '%d %B, %Y')`"
output: "html_document"
---
```{r setup, include=FALSE, echo=TRUE, tidy=TRUE}
# Set echo = TRUE as the default chunk option for the whole document.
knitr::opts_chunk$set(echo = TRUE)
```
# Synopsis
Here we analyse data about personal movement collected by activity monitoring devices, e.g. Fitbit, Nike Fuelband, or Jawbone Up. The data were collected at 5-minute intervals throughout the day during October and November 2012, and record the number of steps taken in each 5-minute interval. For more details, see <https://www.coursera.org/learn/reproducible-research/peer/gYyPt/course-project-1>.
## Loading and preprocessing the data
```{r}
# Load the activity data into DATA.
# Source preference: an existing activity.csv, then a local activity.zip,
# and only as a last resort a fresh download of the archive.
fileurl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
if (!file.exists("activity.csv")) {
  if (file.exists("activity.zip")) {
    unzip(zipfile = "activity.zip")
  } else {
    # mode = "wb" keeps the binary archive intact on Windows; the default
    # download method avoids requiring an external curl executable.
    download.file(fileurl, "repdata_data_activity.zip", mode = "wb")
    unzip(zipfile = "repdata_data_activity.zip")
    file.remove("repdata_data_activity.zip")
  }
}
if (!file.exists("activity.csv")) {
  stop("activity.csv could not be obtained", call. = FALSE)
}
DATA <- read.csv("activity.csv")
```
# What is mean total number of steps taken per day?
The variables included in this dataset are
`r colnames(DATA)` and there are a total of
`r nrow(DATA)` observations in this dataset.
```{r , echo = TRUE}
# Total number of steps per day, ignoring missing values.
# A day without a single recorded value yields NA instead of 0, so that
# unobserved days do not pull the mean and the median towards zero.
DATA$date <- as.factor(DATA$date)
total_steps <- tapply(DATA$steps, DATA$date, function(x) {
  if (all(is.na(x))) NA_real_ else sum(x, na.rm = TRUE)
})
mean_total <- mean(total_steps, na.rm = TRUE)
median_total <- median(total_steps, na.rm = TRUE)

hist(total_steps, main = "", breaks = 15, col = "grey",
     xlab = "total steps by day", ylim = c(0, 30))
abline(v = mean_total, col = "blue", lwd = 2)
abline(v = median_total, col = "red", lwd = 2)
# Shift the labels sideways so they do not sit on top of the marker lines
text(x = mean_total - 3000, y = 15, "mean", col = "blue")
text(x = median_total + 3000, y = 12, "median", col = "red")
```
The mean is `r round(mean(total_steps, na.rm=TRUE))`, and the median is `r signif(median(total_steps, na.rm=TRUE),4)`.
# What is the average daily activity pattern?
```{r, echo = TRUE}
# Average number of steps per 5-minute interval, taken across all days.
DATA$interval <- as.factor(DATA$interval)
steps_5min <- tapply(DATA$steps, DATA$interval, mean, na.rm = TRUE)
# Standard error of each interval mean (only needed for the optional bars)
se_5min <- tapply(DATA$steps, DATA$interval, function(x) {
  sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))
})

# Take the x values from the names of the averages so that every interval is
# paired with its own mean, whatever the row order of DATA is.
interval_id <- as.numeric(names(steps_5min))
avg_steps <- as.numeric(steps_5min)
# which.max() skips NA/NaN and avoids comparing doubles with `==`
peak <- which.max(avg_steps)

plot(x = interval_id, y = avg_steps, type = "l",
     ylab = "steps", xlab = "interval")
# Optional error bars at +/- 2 standard errors:
# arrows(x0 = interval_id, y0 = avg_steps - 2 * as.numeric(se_5min),
#        x1 = interval_id, y1 = avg_steps + 2 * as.numeric(se_5min),
#        length = 0.05, angle = 90, code = 3, col = rgb(0.1, 0.1, 0.1, 0.5))
# Mark the highest average and the interval in which it occurs
abline(h = avg_steps[peak], col = rgb(0.5, 0, 0, 0.5))
abline(v = interval_id[peak], col = rgb(0.5, 0, 0, 0.5))
```
## Imputing missing values
Replace each missing value with the mean value of its 5-minute interval. There are
`r sum(is.na(DATA$steps))` NAs in steps,
`r sum(is.na(DATA$interval))` NAs in interval,
`r sum(is.na(DATA$date))` NAs in date. Only steps contains NAs.
## Impute the NA with the interval's mean.
```{r, echo = TRUE}
# Fill every missing step count with the mean of its 5-minute interval.
missing_steps <- is.na(DATA$steps)
# Lookup table: one row per interval holding its NA-free average
ave_interval <- setNames(
  aggregate(DATA["steps"], by = list(DATA$interval),
            FUN = function(v) mean(v, na.rm = TRUE)),
  c("interval", "ave")
)
# Row of the lookup table that belongs to each missing observation
idx <- match(DATA$interval[missing_steps], ave_interval$interval)
#sum( as.numeric(DATA$interval[missing_steps]) - as.numeric(ave_interval$interval[idx]) )
IMPUTED_DATA <- DATA
IMPUTED_DATA$steps[missing_steps] <- ave_interval$ave[idx]
# Daily totals computed on the imputed data
imputed_total_steps <- by(
  IMPUTED_DATA$steps,
  IMPUTED_DATA$date,
  function(day_steps) sum(day_steps, na.rm = TRUE)
)
```
```{r, echo = TRUE}
# Overlay the histograms of the daily totals before and after imputation.
raw_fill <- rgb(0.5, 0.5, 0.5, 0.3)
imputed_fill <- rgb(0.5, 0, 0, 0.3)
hist(total_steps, breaks = 15, main = '', xlab = 'total steps by day',
     col = raw_fill, ylim = c(0, 30))
hist(imputed_total_steps, breaks = 15, main = '', col = imputed_fill,
     add = TRUE)
legend(x = 12000, y = 30, legend = c("non-imputed", "imputed"),
       pch = c(20, 20), col = c(raw_fill, imputed_fill), bty = "n")

# Mark the mean and the median of the imputed daily totals
new_mean <- mean(imputed_total_steps, na.rm = TRUE)
new_median <- median(imputed_total_steps, na.rm = TRUE)
abline(v = new_mean, col = "blue", lwd = 5)
abline(v = new_median, col = "red", lwd = 2)
text(x = round(new_mean) - 4000, y = 15, "new mean", col = 'blue')
text(x = round(new_median) + 4000, y = 15, "new median", col = 'red')
```
The mean is `r round(mean(imputed_total_steps, na.rm=TRUE))`, and the median is `r signif(median(imputed_total_steps, na.rm=TRUE),4)`.
# Differences between weekdays and weekends.
How many weekdays and weekend days are in the data?
```{r, echo = TRUE, comment = ""}
# Classify every observation as falling on a weekday or on a weekend day and
# compute the average steps per 5-minute interval for each of the two groups.
IMPUTED_DATA$date <- as.Date(IMPUTED_DATA$date)
# The wday component of POSIXlt counts days from Sunday (0) to Saturday (6).
# Unlike the names returned by weekdays(), it does not depend on the language
# of the current locale.
day_number <- as.POSIXlt(IMPUTED_DATA$date)$wday
IMPUTED_DATA$day <- factor(
  ifelse(day_number %in% c(0, 6), "weekend", "weekday"),
  levels = c("weekday", "weekend")
)
#table(IMPUTED_DATA$day)
wend <- subset(IMPUTED_DATA, day == "weekend", select = c(steps, interval))
wday <- subset(IMPUTED_DATA, day == "weekday", select = c(steps, interval))
# Average steps per interval within each day type
wend_steps_5min <- by(wend$steps, wend$interval,
                      function(x) mean(x, na.rm = TRUE))
wdays_steps_5min <- by(wday$steps, wday$interval,
                       function(x) mean(x, na.rm = TRUE))
```
Are there differences in activity patterns between weekdays and weekends?
Re-label the dates as weekdays or weekends (using subset).
```{r, echo = TRUE, comment = ""}
table(IMPUTED_DATA$day)

# Draw one average-activity profile and mark its highest point.
#   profile  : per-interval averages, named by interval identifier
#   line_col : colour of the profile line
#   y_label  : label of the y axis
plot_profile <- function(profile, line_col, y_label) {
  # The names of `profile` are the interval identifiers, so every average is
  # plotted against its own interval regardless of the row order of the data.
  interval_id <- as.numeric(names(profile))
  avg_steps <- as.numeric(profile)
  # which.max() skips NA/NaN and avoids comparing doubles with `==`
  peak <- which.max(avg_steps)
  plot(x = interval_id, y = avg_steps, type = "l", col = line_col,
       ylab = y_label, xlab = "interval", ylim = c(0, 210))
  abline(h = avg_steps[peak], col = rgb(0.5, 0, 0, 0.5))
  abline(v = interval_id[peak], col = rgb(0.5, 0, 0, 0.5))
  invisible(NULL)
}

#par(mfrow=c(2,1))
plot_profile(wdays_steps_5min, "red", "steps weekdays")
plot_profile(wend_steps_5min, "blue", "steps weekends")
```
# Conclusions
We observe similar spikes: on weekdays at interval `r as.numeric(as.matrix(unique(DATA$interval)))[which(wdays_steps_5min==max(wdays_steps_5min, na.rm=TRUE))]`, and on weekends at interval `r as.numeric(as.matrix(unique(DATA$interval)))[which(wend_steps_5min==max(wend_steps_5min, na.rm=TRUE))]`. After the maximum spike, the number of steps during weekdays is lower than on weekends, as expected during office hours.