# R_Script_Template.R

#-[ Header ] ------------------------------------------------------------------
# Program       : Framework for R scripts
# Author        : Georg Maubach
# Date          : 2016-03-03
# Update        : 2016-08-16
# Description   : Foundation for the analysis process
# Source System : R 3.2.5 (64 Bit)
# Target System : R 3.2.5 (64 Bit)
# Release       : 1
# File Name     : R_Script_Template.R
#-------------------------------------------------------------------------------

#- [ Purpose of the document ] ------------------------------------------------
# This document provides a framework for a script able to handle real world 
# data throughout the complete analysis process. In each step examples or 
# prototypes of needed or helpful commands are given. Chapters and sections in 
# this document can be regarded as a toolbox. The needed tools shall be adapted 
# to the processed data. Commands are ordered in a consistent way to support the
# user in producing high quality output.
#-------------------------------------------------------------------------------

# - [ At hand ] ----------------------------------------------------------------
# help("function")    # Extract or Replace Parts of an Object
# example("function") # Examples on "Extract"
# demo(package = .packages(all.available = TRUE)) # Show demos of packages
#-------------------------------------------------------------------------------

# - [ Editing Marks ] ----------------------------------------------------------
# %ROTA% : Result of the analysis in text form if needed to explain further
#          steps
# %ANDE% : Analyst Decision
# %ToDo% : ToDo's
#-------------------------------------------------------------------------------

# - [ Warranty Disclaimer ] ----------------------------------------------------
# The software is provided "as-is". The author disclaims to the fullest extent
# authorized by law any and all warranties, whether express or implied,
# including, without limitation, any implied warranties of merchantability or
# fitness for a particular purpose. Without limitation of the foregoing, the
# author expressly does not warrant that:
#
# (a) the software will meet your requirements or expectations;
# (b) the software or the software content will be free of bugs, errors,
#     viruses or other defects;
# (c) any results, output, or data provided through or generated by the software
#     will be accurate, up-to-date, complete or reliable;
# (d) the software will be compatible with third party software;
# (e) any errors in the software will be corrected.
#-------------------------------------------------------------------------------

# - [ Limitation of Liability ] ------------------------------------------------
# In no event will the author be liable for any direct, indirect, consequential,
# incidental, special, exemplary, or punitive damages or liabilities whatsoever
# arising from or relating to the software, the software content or this
# agreement, whether based on contract, tort (including negligence), strict
# liability or other theory, even if the author has been advised of the
# possibility of such damages.
#
# Use of the software is entirely at the user's own risk.
#-------------------------------------------------------------------------------

#--------1---------2---------3---------4---------5---------6---------7---------8

#-----------#
# Functions #
#-----------#


# Clear console
# "\014" is the form-feed control character (presumably interpreted as
# "clear screen" by the console in use, e.g. RStudio - verify for other
# front ends).
cat("\014")

#-------#
# Setup #
#-------#
# Environment
# Please make sure that RTools is installed
# Show the zip command currently configured; "zip" is returned as fallback
# when the environment variable R_ZIPCMD is not set.
Sys.getenv("R_ZIPCMD", "zip")
# needed for openxlsx::write.xlsx()
# NOTE: machine-specific path - adapt it to the local Rtools installation.
Sys.setenv(R_ZIPCMD= "C:/R-Project/Rtools/bin/zip")

.libPaths()  # Install directory for libraries
# .libPaths("new path if needed")

# Workplace
# The following calls only display information about the current session;
# none of them changes any state.
sessionInfo()        # Environment
list.files(R.home()) # Show R home directory
getwd()              # Get working directory
list.dirs()          # List directories in working directory
list.files()         # List files in working directory
library()            # List all installed packages
search()             # List attached packages (search path)
ls()                 # List objects in environment

#-----------#
# Configure #
#-----------#
# %ToDo% replace the placeholder components by the real project directory.
path <- file.path("path", "to","directory")
setwd(path)          # Set working directory
options(width = 65)  # Set output width

#---------#
# Install #
#---------#
# Show the packages offered by the configured repositories.
available.packages()

# Desired packages
# Character vector of package names; it is used further below both for a
# direct installation and for building a local repository.
my_packages <- c(
  "ctv",           # Package to install packages based on themes
  "data.table",    # Fast manipulation of large datasets
  "desc",          # Descriptive statistics
  "dplyr",         # Data manipulation for data frames
  "geoR",          
  "haven",         # import data from statistical packages
  "Hmisc",         # describe() for dataset overviews
  "httr",          # package to deal with HTTP requests
  "installr",      # Dependency of openxlsx::write.xlsx()
  "lubridate",     # Date parsing, e. g. ymd()
  "mapdata",       # data for high-quality maps
  "maps",          # draw maps
  "maptools",      # import ESRI data
  "memisc",        # package data import and management for SPSS
  "openxlsx",      # Read and write Excel files
  "questionr",     # Statistical functions for convenience
  "reshape2",      # Restructure data
  "stringr",       # String manipulation, e. g. str_replace()
  "tidyr",         # Data cleaning
  "plotrix",       
  "plyr",          # Data manipulation
  "prettyR",       # Descriptive statistics and formatting output
# "rattle",        # Be careful! rattle needs a different RGTK2 lib!
  "RColorBrewer",  # Install ColorBrewer color palettes
  "Rcpp",
  "rgdal",         # Connect to GDAL, Mercator transformation
  "RMySQL",        # Replace by package for your database
  "sp",            # Draw maps, depends on grid and lattice
  "sqldf",         # Execute SQL queries on R datasets
  "zoo"            # Time series analysis
)

# Install
## EITHER: Install from CRAN
## install.packages(pkgs = my_packages, dependencies = TRUE)

## OR: Build local CRAN repository
### EITHER: Install miniCRAN from CRAN
### install.packages(pkgs = "miniCRAN", dependencies = TRUE)
### OR: Install miniCRAN from localhost
### repos = NULL makes install.packages() treat pkgs as paths to local files.
### NOTE: the paths are machine-specific - adapt them to your environment.
install.packages(pkgs = "C:/Software/R-Project/CRAN/bin/windows/contrib/3.2/xml2_0.1.2.zip",
                 dependencies = TRUE,
                 repos = NULL)
install.packages(pkgs = "C:/Software/R-Project/CRAN/bin/windows/contrib/3.2/miniCRAN_0.2.5.zip",
                 dependencies = TRUE,
                 repos = NULL)

library(miniCRAN)

### Determine the dependencies for desired packages
pkg_list <- pkgDep(pkg = my_packages, suggest = TRUE)

### Define a path to local repository
repo <- file.path("H:","2016","Software","R-Project", "CRAN")

# Create repository
# Internet connection required
# If no internet connection is available, download on a machine with internet
# connection and transfer the downloaded repository manually to the target
# machine.
makeRepos(pkgs = pkg_list, path = repo, type = "win.binary")

# install.packages() expects the local repository as a file: URL
repo_path <- paste0("file:", repo)

install.packages(pkgs = pkg_list,
                 dependencies = TRUE,
                 repos = repo_path)

# Install packages based on themes
# see (*6) p. 26
install.packages("ctv")
library(ctv)
install.views("SocialSciences")

# update.packages(ask = FALSE)     # Update (if necessary)

# vignette(all = TRUE)             # Show vignettes of all installed packages

#--------#
# Import #
#--------#
# rm(list = ls()) # Clear workplace (if necessary)
                  # Comment out if handed out to someone else
                  # for not deleting somebody else's workplace

# Load data
# load() restores the objects stored in an R data file into the workspace.
path <- file.path("path", "to", "filename")
load(path)

# Import data
# see (*10)
path <- file.path("path", "to", "filename")

## From Spreadsheets, e. g. Microsoft Excel
library(readxl)

### Read only one Excel sheet
path <- file.path("path", "to", "filename")

#### Address sheets by position
sheet1 <- read_excel(path, sheet = 1)
sheet2 <- read_excel(path, sheet = 2)

#### Address sheets by name
sheets <- excel_sheets(path)
first_sheet <- read_excel(path, sheet = "first_sheet")
second_sheet <- read_excel(path, sheet = "second_sheet")

#### Control the header handling and skip leading rows
n_skip <- 0  # %ToDo% number of rows to skip before reading starts
sheets <- excel_sheets(path)
first_sheet <- read_excel(path,
                          sheet = sheets[1],
                          # alternatives: TRUE | FALSE | c("name1", "name2")
                          col_names = TRUE,
                          skip = n_skip)

#### Provide own column names and column types
n_columns <- 2  # %ToDo% number of columns in the sheet
columns <- c("Column_Name_1", paste0("Column_Name_", 2:n_columns))
second_sheet <- read_excel(path,
                           sheet = sheets[2],
                           col_names = columns,
                           # NULL lets readxl guess the types; otherwise give
                           # one entry per column out of
                           # "numeric" | "text" | "date" | "blank"
                           # (newer readxl versions presumably use "skip"
                           # instead of "blank" - check ?read_excel)
                           col_types = NULL)

### Read all Excel sheets within a workbook
### using lapply creating a list of data.frames
### Each sheet name is handed to read_excel() as its "sheet" argument.
path <- file.path("path", "to", "filename")
my_workbook <- lapply(excel_sheets(path),
                      read_excel,
                      path = path)
detach("package:readxl")

library(XLConnect)

### Create a workbook object
path <- file.path("path", "to", "filename")

book <- loadWorkbook(filename = path)
sheets <- getSheets(book)

### Read a worksheet
### %ToDo% adapt the cell range; 0 presumably lets XLConnect determine the
### boundary automatically (see ?readWorksheet).
data <- readWorksheet(book,
                      sheet = 1,      # sheet index or "sheet name"
                      header = TRUE,  # TRUE | FALSE
                      startCol = 0,
                      startRow = 0,
                      endRow = 0)

### Write a data.frame to a new worksheet
### The sheet has to be created under the same name it is written to.
df <- data.frame("a data.frame")
createSheet(book, "sheet_name")
writeWorksheet(book, df, "sheet_name")
saveWorkbook(book, file = path)

detach("package:XLConnect")

## From other statistical packages
### With haven
### From SAS

#### From SPSS
library(haven)

path <- file.path("path", "to", "filename")

# The file is passed by position because the name of read_sav()'s first
# argument differs between haven versions.
dataset <- read_sav(path)

#### Convert the labelled class from SPSS to factor in R
dataset$variable <- haven::as_factor(dataset$variable)

detach("package:haven")

#### From STATA

### With foreign
library(foreign)

#### From SAS

#### From SPSS
dataset <- read.spss(file = path,
                     to.data.frame = TRUE,
                     # convert labelled variables to factors
                     use.value.labels = TRUE)

#### From STATA
dataset <- foreign::read.dta(file = path,
                             convert.factors = FALSE,
                             convert.underscore = FALSE)

detach("package:foreign")

## From Databases
library("DBI") # library(RMySQL) not required

# %ToDo% replace all connection details by the values of your database.
con <- dbConnect(drv = RMySQL::MySQL(),
                 dbname = "database name",
                 host = "hostname",
                 port = 3306,  # %ToDo% port number (3306 = MySQL default)
                 user = "username",
                 password = "password")

tableList <- dbListTables(conn = con)

### Read entire database table
table <- dbReadTable(con,
                     name = "table name")

### Read a selection respectively a subset of data from a database table
selection <- dbGetQuery(con,
                        statement = "SELECT col_name
                                     FROM table_name
                                     WHERE col_name > some_condition")
### Example 1
products <- dbReadTable(conn = con,
                        name = "Products")

products_selection <- subset(products,
                             subset = contract == 1)

### Example 2
### Produces the same result as Example 1
### but is more efficient because the subsetting is done in the database
### and only the needed entries are read into R.
### The query therefore has to read from the source table "products".
products_selection <- dbGetQuery(conn = con,
                                 statement = "SELECT *
                                              FROM products
                                              WHERE contract = 1")

### Example 3
### Read a database table one chunk at a time
res <- dbSendQuery(conn = con,
                   statement = "SELECT *
                                FROM products
                                WHERE contract = 1")

#### Fetch until the result set is exhausted
while(!dbHasCompleted(res)) {
  chunk <- dbFetch(res, n = 1) # n can have any suitable value
  print(chunk)                 # work with chunk of data in any suitable way
}

dbClearResult(res) # frees the resources held by the result set

### Always disconnect from the database
dbDisconnect(conn = con)

detach("package:DBI")

## From the web
## Check if the import function can access web sites
## right away using e. g. the file argument

## Downloading a file to your local machine
url <- "http://machine.server.com/path/to/filename"
destination <- file.path("path", "to", "filename")
download.file(url = url,
              destfile = destination)

### Import the locally stored file with the known import functions

## With httr
library(httr)

url <- "http://machine.server.com/path/to/filename"
response <- GET(url)
# "as" takes one of "raw", "text" or "parsed"; "parsed" converts the body
# according to its content type. Convert the result to a data.frame
# afterwards if needed.
content <- content(x = response, as = "parsed")
detach("package:httr")

## With jsonlite
library("jsonlite")

# fromJSON() accepts a JSON string, a file or a URL
fromJSON('{"name": "value"}')

### JSON object: unordered collection of name:value pairs
### name = string
### value = string | number | boolean | null | JSON object | JSON array

### JSON array: ordered sequence of values

detach("package:jsonlite")

# Build datasets from data
## Rename variables
### Rename all variables at once: supply exactly one name per column
names(dataset) <- c("new_colname1", "new_colname2")
# (* 24)
### Rename a single variable
names(dataset)[names(dataset) == "variable name"] <- "new variable name"

## Sort variables
dataset2 <- dataset[sort(names(dataset))]

## Sort cases
## see (*1) p. 333ff
### Save the original order
#### Row numbers are stored as characters.
#### Thus type conversion is necessary.
dataset$orig_order <- as.numeric(row.names(dataset))
### Missing values are placed at the end by default
#### na.last = FALSE places missing values at the beginning
#### na.last = NA removes missing values from sorted data
#### Order is ascending by default.
#### Reversing the order of a single variable with a minus sign ("-")
#### works only for numeric variables; other types have to be converted
#### first, e. g. with as.numeric().
#### order() returns row indices only, so they are used to index the
#### dataset.
dataset2 <- dataset[order(-as.numeric(dataset$gender), # descending gender
                          dataset$age), ]              # ascending age

# Save data
# save.image() stores the complete workspace in the given file.
path <- file.path("path", "to", "filename")
save.image(path)

# Match datasets
## Merge only two datasets at a time
## see (*1) p. 288
## %ToDo% id_variable_first_dataset and id_variable_second_dataset are
## placeholders; they presumably have to hold the names of the key columns
## of dataset1 and dataset2 - confirm against ?merge.
dataset3 <- merge(dataset1, dataset2,
                  # use by if the variable names match
                  by.x = id_variable_first_dataset, 
                  by.y = id_variable_second_dataset,
                  # use all = TRUE if both datasets deliver cases
                  all.x = TRUE, all.y = TRUE)

## Merge two or more datasets at a time
## %ToDo%

# Check
# Export the imported data to an Excel workbook for a visual check.
# The openxlsx functions are called with their namespace because the package
# is not attached in this script.
xlsx_check <- "Projectname_Checks.xlsx"
wb <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb,
                       sheetName = "Import")
openxlsx::writeData(wb,
                    sheet = "Import",
                    x = dataset)
openxlsx::saveWorkbook(wb,
                       file = xlsx_check,
                       overwrite = TRUE)

# Create Recovery Point
path <- file.path("Path", "to", "file", "Projectname_Import.RData")
save(dataset, file = path)

#----------#
# Cleaning #
#----------#
# Recover data from previous section
path <- file.path("Path", "to", "file", "Projectname_Import.RData")
load(file = path)

## 1. Inspect raw data
### Get to know the structure of datasets
### see (*6) p. 59
class(dataset)
dim(dataset)
names(dataset)
str(dataset)
dplyr::glimpse(dataset)
summary(dataset)
memisc::codebook(dataset)

### Get to know the data within datasets
### see (*6) p. 59
### see (*8)
#### Print the first and last 10 rows = observations
#### unique() avoids printing rows twice if the dataset has fewer than
#### 20 rows.
row_ids <- seq_len(nrow(dataset))
print(dataset[unique(c(head(row_ids, n = 10), tail(row_ids, n = 10))), ])
head(dataset, n = 10)
tail(dataset, n = 10)
#print()
summary(dataset)
Hmisc::describe(dataset)
#### %ToDo% replace "variable" by the variable to be inspected
hist(dataset$variable)
plot(dataset$variable)

## see (*9)
## 2. Tidy the data according to the principles of tidy data
## The calls below are prototypes only: they name the tool for each step
## and have to be completed with arguments before they can be run.
## NOTE(review): newer tidyr versions offer pivot_longer()/pivot_wider()
## as successors of gather()/spread() - confirm for the installed version.
library(tidyr)
### a) Ensure observations/values are in rows
tidyr::gather()

### b) Ensure variables are in columns
tidyr::spread()

### c) each observation type is stored in its own dataset
tidyr::separate()

### d) Each table is one type of observational unit

detach("package:tidyr")

## 3. Type conversions
### Numericals
#### Convert the variables var1 to varX to numeric.
#### NOTE(review): mutate_each()/funs() match the dplyr version this
#### template was written for; newer dplyr versions presumably expect
#### mutate(across(var1:varX, as.numeric)) instead.
library(dplyr)
dataset2 <- dplyr::mutate_each(dataset1, funs(as.numeric), var1:varX)
detach("package:dplyr")

### Strings
#### Prototypes: complete the calls with arguments before running them.
library(stringr)
stringr::str_replace()
tidyr::unite()          # unite() is part of tidyr, not stringr
detach("package:stringr")

### Dates
#### Format string as date
#### see (*12)
#### as.Date() operates also with numeric instead of character;
#### "origin" is only needed for numeric input (day counts since origin).
date_in_date_format <- as.Date(date_in_string_format,  origin = "1899-12-30")

library(lubridate)
lubridate::ymd(dataset$dateString)
detach("package:lubridate")

#### Extract year from date
#### see (*13)
#### POSIXlt stores the year as years since 1900.
year <- as.POSIXlt(date_in_date_format)$year + 1900

## 4. Missing value analysis and handling
### Missing value analysis
### see (*3)
### %ToDo% "variable" stands for the vector to be analysed
any(is.na(variable))  # Show if observations contain NA
sum(is.na(variable))  # Show number of system missing values (NA)
sum(variable == -999, na.rm = TRUE)  # Count the occurrence of -999 (omit NA)
sum(variable %in% c(-998,-999))      # Count multiple user missing values

summary(variable)        # Check summary result for NA
plot(variable)           # Spot missings graphically
# useNA = "ifany" adds a category for NA, which table() drops by default
table(factor(variable), useNA = "ifany")  # Spot missings in contingency table

sum(!complete.cases(dataset))        # Count incomplete cases
which(!complete.cases(dataset))      # Show incomplete cases

cases_with_na <- which(is.na(dataset$variable)) # find indices of cases with NA
dataset[cases_with_na, ] # Look at the full rows for records having missings

#### Missing value handling
##### Replace missing value in new variable
##### The copy and the replacement have to address the same dataset.
dataset1$new_variable <- dataset1$variable_with_na
dataset1$new_variable[cases_with_na] <- new_value

##### Replace missing value in new dataset
dataset2 <- dataset1
dataset2$variable_with_na[cases_with_na] <- new_value

gender <- c("MALE","FEMALE","FEMALE","UNKNOWN","MALE")
###### Missing values (and all values not explicitly recoded)
###### in "gender" will be recoded to "3".
###### %in% is used instead of == because == yields NA for missing values,
###### which ifelse() would pass on as NA instead of recoding it to 3.
gender_recoded <- ifelse(gender %in% "MALE", 1,
                         ifelse(gender %in% "FEMALE", 2, 3))
gender_recoded
# [1] 1 2 2 3 1

##### Recode missing values
##### see (*3)
##### The lines are alternatives; use the one that fits the data.
variable[variable == -999] <- NA           # Recode all -999 as NA
variable[is.na(variable)] <- -999          # Recode all NA as -999
variable[variable %in% c(-998,-999)] <- NA # Recode any -998 or -999 as NA
variable[variable %in% -990:-999] <- 0     # Recode any value between -990 and
# -999 as 0

##### Select cases if variables have missing values (*20)
####### Example data: 20 customers, most of them without a closing code
cust.id <- as.numeric(1:20)

closed.mdm <- rep(NA_character_, 20)
closed.mdm[c(1, 5, 6, 9, 13)] <- c("01", "08", "07", "05", "04")

closed.sls <- rep(NA_character_, 20)
closed.sls[c(2, 5, 6, 11, 15)] <- c("08", "08", "07", "03", "05")

df <- data.frame(cust.id = cust.id,
                 closed.mdm = closed.mdm,
                 closed.sls = closed.sls,
                 stringsAsFactors = FALSE)

####### Create new variable
####### An account is open only if both closing codes are missing.
df$opcl <- "closed"
df$opcl[is.na(df$closed.mdm) & is.na(df$closed.sls)] <- "open"

####### Filter on new variable
subset(df, opcl == "open")

####### To operate directly on one of the 'closed' columns the following
####### examples may help.
####### Does not work due to the NAs
df[df$closed.sls == "08", ]
####### Works
subset(df, closed.sls == "08")
####### Works also
df[!is.na(df$closed.sls) & df$closed.sls == "08", ]

###### Select cases if the variables have missing values
###### Alternative using the number of missing values per row
cust.id <- as.numeric(1:20)

closed.mdm <- rep(NA_character_, 20)
closed.mdm[c(1, 5, 6, 9, 13)] <- c("01", "08", "07", "05", "04")

closed.sls <- rep(NA_character_, 20)
closed.sls[c(2, 5, 6, 11, 15)] <- c("08", "08", "07", "03", "05")

ds_temp <- data.frame(cust.id = cust.id,
                      closed.mdm = closed.mdm,
                      closed.sls = closed.sls,
                      stringsAsFactors = FALSE)

###### Number of missing closing codes per customer
rowSums(is.na(ds_temp[, c("closed.mdm", "closed.sls")]))
###### [1] 1 1 2 2 0 0 2 2 1 2 1 2 1 2 1 2 2 2 2 2

###### The count equals 2 only if both closing codes are NA. Thus
ds_temp$open <- rowSums(is.na(ds_temp[, c("closed.mdm", "closed.sls")])) == 2
###### yields a column which is TRUE if the account is open and FALSE
###### otherwise.
###### A similar approach can be used for testing the state of account closing.

#### Keep only cases without any missings
#### Two alternatives: index with complete.cases() or use na.omit().
dataset2 <- dataset1[complete.cases(dataset1), ]
dataset2 <- na.omit(dataset1)

# Attention:
# Watch for symbols for missing values from
# a) external sources:
#    #N/A (Excel)
#    .    (SPSS/SAS)
#    empty string
# b) internal sources
#    Inf (infinite value, e. g. 1/0)
#    NaN (not a number, e. g. 0/0)

# Attention:
# If a factor is recoded values in the data which are not defined as levels
# will be transformed to NA. (*21)

## 5. Identify and correct errors
### Outliers
### hist() needs a single numeric variable, not a whole dataset.
summary(dataset)
hist(dataset$variable)
boxplot(dataset)

### %ToDo% error_value and correct_value are placeholders
cases_with_data_error <- which(dataset$variable == error_value)
dataset[cases_with_data_error, ]
dataset$variable[cases_with_data_error] <- correct_value

### Identify and eliminate duplicates (*18)
#### Identify duplicates
#### %ToDo% list all variables that identify a case
key_variables <- c("variable_1", "variable_2", "variable_3")
duplicates <- duplicated(dataset1[ , key_variables])
#### Report on duplicates
print(dataset1[duplicates , ])
head(dataset1[duplicates , ])
#### Eliminate duplicates
#### The comma is essential: without it the logical vector would select
#### columns instead of rows.
dataset2 <- dataset1[!duplicates, ]

## 6. Visualize the results of data cleaning
summary(dataset)
head(dataset, n = 20)
hist(dataset$variable)
plot(dataset$variable1, dataset$variable2)
boxplot(dataset$variable)

# Special operations
## Operate only on variables of a specific class, e.g. character, numeric
## This works only for data transformations and is not applicable for
## statistical functions like head(), sum(), mean(), etc.
## See an alternative for statistical functions in Section "Prepare" below.
## See (*22).
## inherits() compares class names literally; to match several classes pass
## a character vector, e. g. c("character", "numeric", "factor").
for (i in seq_len(ncol(dataset))) {
  if (inherits(x = dataset[[i]], what = "character")) {
    dataset[[i]] <- as.factor(dataset[[i]])
  }
}

## Add row numbers as a variable to the dataset
## seq_len() also handles a dataset without rows correctly.
dataset$case_id <- seq_len(nrow(dataset))

# Check
# Add the cleaned data as sheet "Cleaning" to the check workbook.
# The openxlsx functions are called with their namespace because the package
# is not attached in this script.
xlsx_check <- "Projectname_Checks.xlsx"
# Continue the existing check workbook so that sheets of earlier steps are
# kept; start a new workbook if there is none yet.
if (file.exists(xlsx_check)) {
  wb <- openxlsx::loadWorkbook(xlsx_check)
} else {
  wb <- openxlsx::createWorkbook()
}
# addWorksheet() fails for an existing sheet name, e. g. on a re-run
if (!("Cleaning" %in% names(wb))) {
  openxlsx::addWorksheet(wb,
                         sheetName = "Cleaning")
}
openxlsx::writeData(wb,
                    sheet = "Cleaning",
                    x = dataset)
# The file may already exist, thus it has to be overwritten.
openxlsx::saveWorkbook(wb,
                       file = xlsx_check,
                       overwrite = TRUE)

# Create Recovery Point
path <- file.path("Path", "to", "file", "Projectname_Cleaning.RData")
save(dataset, file = path)

#---------#
# Prepare #
#---------#
# Recover data from previous section
path <- file.path("Path", "to", "file", "Projectname_Cleaning.RData")
load(file = path)

# Restructuring
# The calls below are prototypes only: they name the available tools and
# have to be completed with arguments before they can be run.
library(tidyr)
dataset <- tidyr::spread()
dataset <- tidyr::gather()

library(reshape2)
dataset <- reshape2::melt()
dataset <- reshape2::dcast()

# Delete variables
## Copy of the dataset without a variable; the original stays untouched
reduced_dataset <- dataset[, names(dataset) != "variable", drop = FALSE]
## Copy of the dataset without several variables given by their index
reduced_dataset <- dataset[, -c(index_of_variables), drop = FALSE]
## Delete the given variable from the dataset itself
dataset$variable <- NULL

# Create new variables
dataset$new_variable <- NA
dataset2 <- data.frame(dataset1, new_variable)
dataset2 <- cbind(dataset, new_variable)

# Rename a single variable using standard R functions (see (*14))
names(dataset)[names(dataset) == "old_variable_name"] <- "new_variable_name"

# Rename variables using plyr::rename (see (*15))
# reshape2 does not export a rename() function; plyr::rename() takes a named
# character vector of the form c(old_name = "new_name").
dataset <- plyr::rename(dataset, c(old_varname_1 = "new_varname_1",
  old_varname_2 = "new_varname2"))
# Release reshape2 which was attached in the restructuring step
detach("package:reshape2")

# Recode 1: Manually
## see (*2) p. 87ff
## Each comparison yields TRUE (= 1) for exactly one class, so the sum is
## the class code. The intervals must neither overlap nor leave gaps:
## <= 17, 18 to 30, 31 to 65, > 65.
dataset$recoded <- (dataset$variable <= 17) * 1 +
                   (dataset$variable > 17 & dataset$variable <= 30) * 2 +
                   (dataset$variable > 30 & dataset$variable <= 65) * 3 +
                   (dataset$variable > 65) * 4
## see (*1) p. 378ff
dataset$recoded <- factor(dataset$recoded,
                          levels = c(1, 2, 3, 4),
                          labels = c("Pupils", "Young Professionals",
                                     "Professionals", "Retired"))

# Recode 2: Manually
# Recode the value of a single case and assign a new value to a selected
# variable
# which() drops NA from the comparison; NA is not allowed as row index in
# an assignment to a data frame.
dataset[which(dataset$varname_1 == "9999"), "varname_2"] <- 0

# Filter out cases (see (*15) and (*16))
# Never use subset() because its use is not safe (see (*17))
# filter_var is class logical
## Create filter
### WITH NA
filter_var <- (dataset1$variable_1 > 2) | (dataset1$variable_2 < 4)
### see (*19)
### %in% has to be applied to the variable itself (a vector); applied to a
### one-column data frame it does not compare the single values.
filter_var <- dataset1[["variable_1"]] %in% c("item_1", "item_2") # character
filter_var <- dataset1[["variable_1"]] %in% c(item_1, item_2)     # numerics

### WITHOUT NA
### which() returns only the indices of cases for which the condition is
### TRUE and thereby drops cases with NA.
dataset2 <- dataset1[which(dataset1$variable_1 == condition_1), ]
filter_var <- which(dataset1$variable_1 == logical_condition_1 &
    dataset1$variable_2 == logical_condition_2)
dataset2 <- dataset1[filter_var, ]

## Filter out not needed cases
dataset2 <- dataset[filter_var, ]

## Filter
## dplyr was written by Hadley Wickham: should be safe to use
library(dplyr)
filter(dataset, v1 < 0.5 | v2 == "g")
detach("package:dplyr")

library(sqldf)
sqldf('SELECT *
      FROM dataset
      WHERE v1 < 0.5 OR v2 = "g"')
detach("package:sqldf")

## Filter duplicates and corresponding non-duplicates
### The following snippet keeps the duplicated cases together with the
### first occurrence they are duplicates of.

### Example data: seven debitors occur several times, eight only once
Debitor <- c(rep(c("968691", "A04046", "L0006", "L0023",
                   "L0056", "L0094", "L0124"),
                 times = c(3, 2, 3, 2, 2, 3, 2)),
             "L0143", "L0170", "13459", "473908", "394704",
             "4711", "4712", "4713")
var1 <- c(11, 12, 13, 14, 14, 12, 13, 14, 10, 11, 12, 12, 12,
          12, 12, 15, 17, 11, 14, 12, 17, 13, 15, 16, 11)
ds_example <- data.frame(Debitor = Debitor, var1 = var1)
ds_example$case_id <- seq_len(nrow(ds_example))
ds_example <- ds_example[sort(names(ds_example))]
ds_example

# The task is to generate a data frame that contains the duplicates AND the
# case each duplicate is a copy of.
# Example: searching for duplicates delivers the cases 2 and 3, but the
# result shall also contain case 1 because it is the counterpart of the
# duplicated cases 2 and 3.
# For the whole example dataset the wanted cases are the first 17:
needed <- rep(c(TRUE, FALSE), times = c(17, 8))
ds_example <- data.frame(ds_example, needed = needed)
ds_example

# To find the duplicates and the corresponding non-duplicates
## Step 1: mark every repeated occurrence of a debitor
duplicates <- duplicated(ds_example$Debitor)

## Step 2: collect the debitors that occur more than once
list_of_duplicated_debitors <- as.character(ds_example$Debitor[duplicates])
filter_variable <- unique(list_of_duplicated_debitors)

### Wrong code. Do not run.
### ds_duplicates <- ds_example["Debitor" == filter_variable]  # Result: dataset with 0 columns
### duplicates_and_correponding_non_duplicates <- ds_example["Debitor"] %in% filter_variable  # Result: FALSE

## Step 3: mark all cases of these debitors (two equivalent notations)
duplicates_and_correponding_non_duplicates <-
  ds_example$Debitor %in% filter_variable       # Result: OK
duplicates_and_correponding_non_duplicates <-
  ds_example[, "Debitor"] %in% filter_variable  # Result: OK

### Create the dataset with duplicates and corresponding non-duplicates
ds_example <- ds_example[duplicates_and_correponding_non_duplicates, ]
ds_example

# Handling of character vectors
# see (*4)
# The calls are prototypes listing the available functions; x, a, b, start,
# stop, split, pattern and replacement are placeholders.
# x = variable
tolower(x)                   # converts x to all lower case 
toupper(x)                   # converts x to all upper case 
nchar(x)                     # a vector of the lengths of each value
paste(a,b,sep="_")           # concatenates character values
substr(x,start,stop)         # extract characters from positions start to stop
strsplit(x,split)            # split each value of x into a list of strings 
                             # using split as the delimiter 
grep(pattern,x)              # returns the indices of the elements of x that
                             # contain pattern
grepl(pattern,x)             # returns a logical vector indicating whether 
                             # each element of x contained pattern 
regexpr(pattern,x)           # returns the integer positions of the first
                             # occurrence of pattern in each element of x 
gregexpr(pattern,x)          # returns a list of the integer positions of all
                             # of the occurrences of pattern in each value of x 
gsub(pattern,replacement,x)  # replaces each occurrence of pattern with
                             # replacement

# see (*5)
match()                      # compares two vectors, can be also numeric
pmatch()                     # compares parts of two vectors, can be also numeric

# Creating variables on the fly
# see (*7)
# For each year a factor "Reeller_Kunde_<year>" is derived from the turnover
# variable "Umsatz_<year>": turnover <= 0 means not buying, otherwise buying.
str(Kunden01)

for (year in 2011:2015) {
  # Build the variable names for the current year
  Umsatz <- paste0("Umsatz_", year)
  Reeller_Kunde <- paste0("Reeller_Kunde_", year)
  cat('Creating', Reeller_Kunde, 'from', Umsatz, '\n')
  # Code the turnover as 1/2 and label the codes in one step
  Kunden01[[Reeller_Kunde]] <- factor(
    ifelse(Kunden01[[Umsatz]] <= 0, 1, 2),
    levels = c(1, 2),
    labels = c("NICHT kaufend", "kaufend")
  )
}

str(Kunden01)

# Sort dataset
dataset2 <- dataset[order(dataset$var_1_to_be_sorted_by, dataset$var_2_to_be_sorted_by), ]

# Save data
## As R data file
## save.image() stores the complete workspace in the given file.
path <- file.path("path", "to", "filename")
save.image(path)

## As Excel file
library(XLConnect)

### Create a workbook object
path <- file.path("path", "to", "filename")

book <- loadWorkbook(filename = path)

### The sheet has to be created under the same name it is written to.
df <- data.frame("a data.frame")
createSheet(book, "sheet_name")
writeWorksheet(book, df, "sheet_name")
saveWorkbook(book, file = path)

detach("package:XLConnect")

# Check
# Add the prepared data as sheet "Preparation" to the check workbook.
# The openxlsx functions are called with their namespace because the package
# is not attached in this script.
xlsx_check <- "Projectname_Checks.xlsx"
# Continue the existing check workbook so that sheets of earlier steps are
# kept; start a new workbook if there is none yet.
if (file.exists(xlsx_check)) {
  wb <- openxlsx::loadWorkbook(xlsx_check)
} else {
  wb <- openxlsx::createWorkbook()
}
# addWorksheet() fails for an existing sheet name, e. g. on a re-run
if (!("Preparation" %in% names(wb))) {
  openxlsx::addWorksheet(wb,
                         sheetName = "Preparation")
}
# Write to the sheet created for this step
openxlsx::writeData(wb,
                    sheet = "Preparation",
                    x = dataset)
# The file may already exist, thus it has to be overwritten.
openxlsx::saveWorkbook(wb,
                       file = xlsx_check,
                       overwrite = TRUE)

# Create Recovery Point
path <- file.path("Path", "to", "file", "Projectname_Preparation.RData")
save(dataset, file = path)

#---------#
# Analyse #
#---------#
# Recover data from previous section
path <- file.path("Path", "to", "file", "Projectname_Preparation.RData")
load(file = path)

# Simple frequency table
tbl <- as.data.frame(table(dataset))
print(tbl)

library(dplyr)

# Combination of group_by() and mutate() creates new variables
# within each group.
# see (*8)
# If mutate() uses the rank() function within-group rankings
# are calculated.
# Example:
# Filter ArrDelay, group by carrier, create a mean by carrier,
# rank this new mean and then sort the carriers based on the
# ranking.
# In the pipeline below rank() gives rank 1 to the smallest mean and
# arrange() sorts ascending by that rank; use arrange(desc(rank)) to list
# the largest mean first.
dataset %>%
  filter(!is.na(var1) & var1 > 0) %>%
  group_by(var2) %>%
  summarise(avg = mean(var1)) %>%
  mutate(rank = rank(avg)) %>%
  arrange(rank)

detach("package:dplyr")

#---------#
# Special #
#---------#
# Extract data for communication with R-Help mailing list (*11)
# dput() writes a text representation of the selected part of the dataset.
# %ToDo% firstRow, lastRow, firstVariable and lastVariable are placeholders
# for the row and column range to be extracted.
dput(dataset[firstRow:lastRow, firstVariable:lastVariable])

# Analyse a factor vector with empty levels
var1 <- c(1, 1, 2, 2)
var1 <- factor(var1,
               levels = c(1, 2, 3, 4),
               labels = c("Used Level 1", "Used Level 2",
                          "Empty Level 3", "Empty Level 4"))
# Subsetting a factor with drop = TRUE removes the unused levels
var1_summary <- summary(var1[ , drop = TRUE])
print(var1_summary)

ds_example <- data.frame(var1)
# Attention: [[]] is used to extract the factor from the data frame!
# summary() itself has no "drop" argument for factors (it would be ignored
# silently), so the unused levels are removed with droplevels() first.
ds_summary <- summary(droplevels(ds_example[["var1"]]))
print(ds_summary)

# Detach library and unload it (*23)
# detach() raises an error if the package is not attached, thus the search
# path is checked first.
if ("package:dplyr" %in% search()) {
  detach("package:dplyr", unload = TRUE)
}

# References
# (*1)  Muenchen: R for SAS and SPSS Users, 2. Ed., New York, 2011
# (*2)  Hain: Statistik mit R, 1. Ed., Hannover, 2011
# (*3)  Allerhand: R Programming, Essential Functions, Missing Values
#       (http://forums.psy.ed.ac.uk/R/P01582/essential-10/)
# (*4)  Philippi: Data Manipulation in R
#       (http://science.nature.nps.gov/im/datamgmt/statistics/r/fundamentals/manipulation.cfm)
# (*5)  Spector: Introduction to R
#       (https://www.stat.berkeley.edu/~spector/Rcourse.pdf)
# (*6)  Manderscheid: Sozialwissenschaftliche Datenanalyse mit R, 1. Aufl.,
#       Wiesbaden, 2012
# (*7)  MacQueen: Creating Variables on the Fly
#       (http://r.789695.n4.nabble.com/Creating-variables-on-the-fly-td4720034.html)
# (*8)  Grolemund: Data Manipulation in R with dplyr in: Datacamp.com
#       (https://www.datacamp.com/courses/dplyr-data-manipulation-r-tutorial)
# (*9)  Carchedi: Cleaning Data in R, in: Datacamp.com
#       (https://www.datacamp.com/courses/cleaning-data-in-r)
# (*10) Schouwenaars: Importing Data into R, in: Datacamp.com
#       (https://www.datacamp.com/courses/importing-data-into-r) 
# (*11) Joshua Wiley-2 on r-help@r-project.org
#       http://r.789695.n4.nabble.com/If-find-x-then-y-else-do-nothing-td3722300.html
# (*12) Dan J. Nordlund on r-help@r-project.org
#       http://r.789695.n4.nabble.com/Convert-number-to-Date-td1691251.html
# (*13) Brian Ripley on r-help@r-project.org
#       http://r.789695.n4.nabble.com/Extracting-year-from-a-date-object-td906915.html
# (*14) Muenchen: R for SAS and SPSS Users, 2. Ed., New York, 2011, p. 262f
# (*15) Hain: Statistik mit R, 1. Aufl, Hannover, 2011, p. 80f
# (*16) mpalanco:
#       http://stackoverflow.com/questions/4935479/how-to-combine-multiple-conditions-to-subset-a-data-frame-using-or
# (*17) Hadley Wickham: Variable passed to function not used in function in select=... in subset
#       http://r.789695.n4.nabble.com/Variable-passed-to-function-not-used-in-function-in-select-in-subset-tt872217.html
# (*18) Muenchen: R for SAS and SPSS Users, 2. Ed., New York, 2011, p. 311
# (*19) Cookbook R: Reordering the columns in a data frame
#       http://www.cookbook-r.com/Manipulating_data/Reordering_the_columns_in_a_data_frame/
# (*20) http://r.789695.n4.nabble.com/Subscripting-problem-with-is-na-tp4722082p4722112.html
#       https://www.mail-archive.com/r-help@r-project.org/msg236476.html
# (*21) http://r.789695.n4.nabble.com/Interdependencies-of-variable-types-logical-expressions-and-NA-td4720183.html
# (*22) http://r.789695.n4.nabble.com/selecting-columns-from-a-data-frame-or-data-table-by-type-ie-numeric-integer-td4720240.html
# (*23) http://stackoverflow.com/questions/6979917/how-to-unload-a-package-without-restarting-r
# (*24) Hain: Statistik mit R, 1. Ed., Hannover, 2011, p. 83
# EOF