Skip to content

Latest commit

 

History

History
209 lines (151 loc) · 6.25 KB

PA1.md

File metadata and controls

209 lines (151 loc) · 6.25 KB

Reproducible Research: Peer Assessment 1

14 June 2014

Introduction

This assignment demonstrates the use of R Markdown in conjunction with the knitr package to produce literate code. Forked from rdpeng/RepData_PeerAssessment1.

Loading and preprocessing the data

  1. Load packages plyr and ggplot2

    library(plyr)
    library(ggplot2)
  2. Load data from file activity.csv

    # Read the raw activity log; the path is relative to the working directory
    dat <- read.csv(file = "./data/activity.csv")
  3. Transform the 'date' column from factor into date

    # Parse the 'date' column as ISO dates, then print the resulting structure
    dat$date <- as.Date(dat$date, format = "%Y-%m-%d")
    str(dat)
    ## 'data.frame':	17568 obs. of  3 variables:
    ##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
    ##  $ date    : Date, format: "2012-10-01" "2012-10-01" ...
    ##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
    
  4. Remove NAs for first analyses

    # Keep only the rows that have a recorded step count
    datNoNA <- subset(dat, !is.na(steps))

What is mean total number of steps taken per day?

  1. Summarise steps by date (excluding NAs)

    # One row per date holding that day's total steps
    stepsDate <- ddply(.data = datNoNA, .variables = "date", .fun = summarise,
                       steps = sum(steps))
  2. Chart steps by date as a bar plot with mean and median lines

    # Build a bar chart of steps per date with mean and median reference lines.
    #
    # Args:
    #   data: data frame with a 'date' column (x axis) and a 'steps' column
    #         (bar heights; also the source of the mean and the median).
    # Returns:
    #   A ggplot object, so callers can keep adding layers or labels with `+`.
    histStepsDate <- function(data) {
        # Bars of steps by date. geom_bar(stat = "identity") is used instead of
        # qplot(..., stat = "identity"): current ggplot2 releases ignore qplot's
        # `stat` argument, which makes a bar geom with a y value fail.
        p <- ggplot(data, aes(x = date, y = steps)) + geom_bar(stat = "identity")
        
        # Horizontal line at mean
        pMean <- mean(data$steps)
        p <- p + geom_hline(yintercept = pMean, color = "red", size = 2)
        # Label mean line; x is the date found in the middle row of `data`,
        # y is a fixed height of 20000 steps
        posX <- with(data, date[ceiling(length(date) / 2)])
        posY <- 20000
        label <- paste("Mean =", format(pMean, nsmall = 1))
        p <- p + annotate("text", x = posX, y = posY, label = label,
                          color = "red")
        
        # Horizontal line at median (dashed, thinner than the mean line)
        pMedian <- median(data$steps)
        p <- p + geom_hline(yintercept = pMedian, color = "blue", linetype = 2,
                            size = 1)
        # Label median line at a fixed height of 19000 steps, just below the
        # mean label
        posY <- 19000
        label <- paste("Median =", format(pMedian, nsmall = 1))
        p <- p + annotate("text", x = posX, y = posY, label = label,
                          color = "blue")
        
        # Return plot
        p
    }
    # Draw the daily totals chart for the NA-free data
    histStepsDate(stepsDate) + ggtitle("Total Steps by Date (Remove NAs)")

    plot of chunk histStepsDate

What is the average daily activity pattern?

  1. Summarise steps by interval (excluding NAs)

    # One row per interval holding the mean steps across all dates
    stepsInterval <- ddply(.data = datNoNA, .variables = "interval",
                           .fun = summarise, steps = mean(steps))
  2. Chart steps by interval as a line with max

    # Build a line chart of steps per interval and mark the highest point.
    #
    # Args:
    #   data: data frame with an 'interval' column (x axis) and a 'steps'
    #         column (y axis).
    # Returns:
    #   A ggplot object, so callers can keep adding layers or labels with `+`.
    lineStepsInterval <- function(data) {
        # Line plot of steps by interval
        plt <- qplot(interval, steps, data = data, geom = "line")

        # Peak value and the interval(s) where it is reached
        peakSteps <- max(data$steps)
        peakIntv <- data$interval[which(data$steps == peakSteps)]

        # Caption text; it is drawn 825 x-units to the right of the peak
        caption <- paste("Interval =", peakIntv, "| Max Avg Steps = ",
                         format(peakSteps))

        # Red marker on the peak first, then the caption beside it
        plt <- plt + annotate("point", x = peakIntv, y = peakSteps,
                              color = "red", size = 5)
        plt <- plt + annotate("text", x = peakIntv + 825L, y = peakSteps,
                              color = "red", label = caption)

        plt
    }
    # Draw the interval profile for the NA-free data
    lineStepsInterval(stepsInterval) +
        ggtitle("Mean Steps by Interval (Remove NAs)")

    plot of chunk lineStepsInterval

Imputing missing values

  1. Replace NAs with mean for interval

    # Fill missing step counts with the mean recorded for the same interval.
    #
    # Args:
    #   sSteps:    step counts, possibly containing NA.
    #   iInterval: interval identifiers, one per element of sSteps.
    #   data:      lookup table with 'interval' and 'steps' columns; defaults
    #              to the per-interval means in stepsInterval.
    # Returns:
    #   sSteps with every NA replaced by the 'steps' value of the matching
    #   interval in `data` (NA stays NA when the interval is not in `data`).
    replaceNA <- function(sSteps, iInterval, data = stepsInterval) {
        # match() looks each interval up individually, so the result is right
        # for vectors of any length, not only for one-row groups; comparing
        # iInterval with data$interval by `==` would pair them by position.
        fill <- data$steps[match(iInterval, data$interval)]
        ifelse(is.na(sSteps), fill, sSteps)
    }
    # Apply the fill to each (date, interval) group of the full data set
    datReplaceNA <- ddply(.data = dat, .variables = c("date", "interval"),
                          .fun = transform,
                          steps = replaceNA(steps, interval))
  2. Summarize steps by date and chart as a bar plot with mean and median lines

    # Daily totals from the imputed data, drawn with the bar-chart helper
    datTemp <- ddply(.data = datReplaceNA, .variables = "date",
                     .fun = summarise, steps = sum(steps))
    histStepsDate(datTemp) + ggtitle("Total Steps by Date (Replace NAs)")

    plot of chunk histReplaceNA

    # Discard the temporary daily summary
    rm(list = "datTemp")
  3. Summarize steps by interval and chart as a line with max

    # Interval means from the imputed data, drawn with the line-chart helper
    datTemp <- ddply(.data = datReplaceNA, .variables = "interval",
                     .fun = summarise, steps = mean(steps))
    lineStepsInterval(datTemp) +
        ggtitle("Mean Steps by Interval (Replace NAs)")

    plot of chunk avgReplaceNA

    # Discard the temporary interval summary
    rm(list = "datTemp")

Are there differences in activity patterns between weekdays and weekends?

  1. Add weekday character column to data

    # Label each date as "weekday" or "weekend".
    #
    # Args:
    #   date: vector of dates that format() can render with "%w".
    # Returns:
    #   Character vector: "weekday" where the day number is 1-5, otherwise
    #   "weekend".
    weekDayOrEnd <- function(date) {
        # "%w" renders the day of the week as a number, with Sunday as 0
        dayNumber <- format(date, format = "%w")
        isWorkDay <- dayNumber %in% 1:5
        ifelse(isWorkDay, "weekday", "weekend")
    }
    # Add a 'weekday' column holding the weekDayOrEnd() label of each row's date
    datReplaceNA <- transform(datReplaceNA, weekday = weekDayOrEnd(date))
  2. Summarise steps by interval categorized by weekday/weekend

    # Mean steps for every combination of interval and weekday/weekend label
    stepsSplit <- ddply(.data = datReplaceNA,
                        .variables = c("interval", "weekday"),
                        .fun = summarise, steps = mean(steps))
  3. Chart steps by interval as a line with facets by weekday/weekend

    # One panel per weekday/weekend label, each with the line and a smoother
    p <- qplot(interval, steps,
               data = stepsSplit,
               geom = c("line", "smooth"),
               facets = . ~ weekday)
    p + ggtitle("Mean Steps by Interval (Replace NAs)")
    ## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
    ## geom_smooth: method="auto" and size of largest group is <1000, so using loess. Use 'method = x' to change the smoothing method.
    

    plot of chunk lineStepsSplit

    # Discard the plot object now that it has been drawn
    rm(list = "p")