forked from mhahsler/Introduction_to_Data_Mining_R_Examples
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchap2_exploring.R
205 lines (161 loc) · 6.84 KB
/
chap2_exploring.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#' ---
#' title: "R Code for Chapter 2 of Introduction to Data Mining: Exploring Data (with ggplot2)"
#' author: "Michael Hahsler"
#' output:
#'   html_document:
#'     toc: true
#' ---
#' This is additional code related to chapter 2 of _"Introduction to Data Mining"_
#' by Pang-Ning Tan, Michael Steinbach and Vipin Kumar.
#' __See [table of contents](https://github.com/mhahsler/Introduction_to_Data_Mining_R_Examples#readme) for code examples for other chapters.__
#'
#' 
#' This work is licensed under the
#' [Creative Commons Attribution 4.0 International License](http://creativecommons.org/licenses/by/4.0/). For questions please contact
#' [Michael Hahsler](http://michael.hahsler.net).
#'
#' # ggplot2
#'
#' This code uses `tidyverse` for data preparation and `ggplot2` for most visualizations.
library(tidyverse)
library(ggplot2)
#'
#' The gg in `ggplot2` stands for [__grammar of graphics__](https://www.springer.com/statistics/computational/book/978-0-387-24544-7).
#' The idea is that every graph is built from the same basic components:
#'
#' - the data,
#' - a coordinate system, and
#' - visual marks representing the data (geoms).
#'
#' In `ggplot2`, the components are combined using the `+` operator.
#'
#' > `ggplot(data, mapping = aes(x = ..., y = ..., color = ...)) +`
#' > `geom_point() +`
#' > `coord_cartesian()`
#'
#' Each `geom_X` uses a `stat_Y` function to calculate what it visualizes. For example,
#' `geom_bar` uses `stat_count` to create a bar chart by counting how often each value appears in the data (see `? geom_bar`). `geom_point` just uses the stat `"identity"` to display the points using the coordinates as they are.
#'
#' RStudio's [Data Visualization Cheat Sheet](https://github.com/rstudio/cheatsheets/raw/master/data-visualization-2.1.pdf) offers a comprehensive overview of available components. A good introduction
#' can be found in the [Chapter on Data Visualization](https://r4ds.had.co.nz/data-visualisation.html) of the free book [R for Data Science](https://r4ds.had.co.nz).
#'
#' # Basic statistics
#'
#' Load the iris data set and convert the data.frame into a tidyverse tibble (optional).
# Load the built-in data set; the conversion to a tibble is optional.
data("iris")
iris <- iris %>% as_tibble()
iris
#'
#' Get summary statistics
summary(iris)
#' Mean and standard deviation of the sepal length
mean(pull(iris, Sepal.Length))
sd(pull(iris, Sepal.Length))
#' Missing values can be skipped with `na.rm = TRUE` (Note: this data does not
#' contain any, but this is what you would do)
mean(pull(iris, Sepal.Length), na.rm = TRUE)
#' Robust mean (trim 10% of the observations from each end of the distribution
#' before averaging)
mean(pull(iris, Sepal.Length), trim = 0.1)
#' Calculate a summary for all numeric columns
# `across(where(is.numeric), ...)` applies the function(s) to every numeric
# column; it replaces the superseded `summarize_if()` and produces the same
# column names (`<column>` for one function, `<column>_<fn>` for a named list).
iris %>% summarize(across(where(is.numeric), mean))
iris %>% summarize(across(where(is.numeric), sd))
iris %>%
  summarize(
    across(where(is.numeric), list(min = min, median = median, max = max))
  )
#' MAD (median absolute deviation)
iris %>% summarize(across(where(is.numeric), mad))
#' # Tabulate data
#'
#' Count the different species.
iris %>% count(Species)
#' Discretize the data first since there are too many values (`cut()` with
#' `breaks = 3` divides the range of each column into three intervals, see
#' package discretization for other methods)
iris_discrete <- iris %>%
  mutate(across(
    where(is.numeric),
    function(x) {
      # `ordered_result` is spelled out in full: the abbreviation `ordered`
      # only works through partial argument matching.
      cut(
        x,
        breaks = 3,
        labels = c("short", "medium", "long"),
        ordered_result = TRUE
      )
    }
  ))
iris_discrete
summary(iris_discrete)
#' Create some tables (creating tables is a little harder using tidyverse)
# Each call cross-tabulates the two selected (discretized) columns.
table(select(iris_discrete, Sepal.Length, Sepal.Width))
table(select(iris_discrete, Petal.Length, Petal.Width))
table(select(iris_discrete, Petal.Length, Species))
# Cross-tabulating all columns at once is left disabled:
# table(iris_discrete)
#' Test if the two features are independent given the counts in the
#' contingency table (H0: independence)
#'
#' p-value: the probability of seeing a more extreme value of the test
#' statistic under the assumption that H0 is correct. Low p-values (typically
#' less than .05 or .01) indicate that H0 should be rejected.
# Contingency table of the two discretized sepal measurements.
tbl <- table(select(iris_discrete, Sepal.Length, Sepal.Width))
tbl
chisq.test(tbl)
#' Fisher's exact test is the better choice when counts are small (cells with
#' counts <5)
fisher.test(tbl)
#' Plot the distribution for a discrete variable
# Counts per level, followed by the same information as a bar chart.
iris_discrete[["Sepal.Length"]] %>% table()
ggplot(data = iris_discrete, mapping = aes(x = Sepal.Length)) +
  geom_bar()
#' # Percentiles
iris %>% pull(Petal.Length) %>% quantile()
#' Interquartile range
quart <- iris %>% pull(Petal.Length) %>% quantile()
# Select the quartiles by name rather than by position, and drop the
# inherited "75%" label so the difference is not mistaken for a quantile.
unname(quart["75%"] - quart["25%"])
#' # Visualizations
#' ### Histogram
#'
#' Show the distribution of a single numeric variable
ggplot(data = iris, mapping = aes(x = Petal.Width)) +
  geom_histogram(bins = 20)
#' ### Scatter plot
#'
#' Plot two numeric variables against each other; color marks the species
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, color = Species)
) +
  geom_point()
#' ### Scatter plot matrix
#'
#' Show the pairwise relationships between several variables at once
library(GGally)
ggpairs(iris, mapping = aes(color = Species))
#' ### Boxplot
#'
#' Compare the distribution of several continuous variables
# One box per species for the sepal length.
ggplot(iris, aes(Species, Sepal.Length)) + geom_boxplot()
#' Group-wise averages
# `across(where(is.numeric), mean)` replaces the superseded `summarize_if()`.
iris %>%
  group_by(Species) %>%
  summarize(across(where(is.numeric), mean))
#' ### ECDF: Empirical Cumulative Distribution Function
# Build the ECDF object from the same variable that is plotted below
# (previously `e` was computed from Petal.Length while the plot showed
# Petal.Width).
e <- iris %>% pull(Petal.Width) %>% ecdf()
e
ggplot(iris, aes(Petal.Width)) + stat_ecdf()
#' ### Data matrix visualization
# One tile per (row id, measurement) pair; the first four columns are
# stacked into `name`/`value` pairs by pivot_longer().
iris %>%
  mutate(id = row_number()) %>%
  pivot_longer(cols = 1:4) %>%
  ggplot(mapping = aes(x = name, y = id, fill = value)) +
  geom_tile() +
  scale_fill_viridis_c()
#' After scaling, values smaller than the average are shown in red and larger ones in blue (the default colors of `scale_fill_gradient2()`)
# Standardize the numeric columns, then draw the same tile plot with a
# diverging color scale.
iris_scaled <- iris %>% select(-Species) %>% scale()
as_tibble(iris_scaled) %>%
  mutate(id = row_number()) %>%
  pivot_longer(cols = 1:4) %>%
  ggplot(mapping = aes(x = name, y = id, fill = value)) +
  geom_tile() +
  scale_fill_gradient2()
#' Reorder
library(seriation)
# Compute an ordering for the scaled matrix and apply it before plotting.
o <- seriate(iris_scaled)
iris_ordered <- permute(iris_scaled, o)
as_tibble(iris_ordered) %>%
  mutate(id = row_number()) %>%
  pivot_longer(cols = 1:4) %>%
  ggplot(mapping = aes(x = name, y = id, fill = value)) +
  geom_tile() +
  scale_fill_gradient2()
#' ### Correlation matrix
#'
#' Calculate and visualize the correlation between features
# Feature-by-feature correlations of the numeric columns.
cm1 <- cor(as.matrix(select(iris, -Species)))
cm1
library(ggcorrplot)
ggcorrplot(cm1)
#' The same matrix drawn with hmap from package seriation
hmap(cm1, margin = c(7, 7), cexRow = 1, cexCol = 1)
#' Test if correlation is significantly different from 0
# Each call tests H0: the true correlation between the two columns is 0.
cor.test(iris$Sepal.Length, iris$Sepal.Width)
cor.test(iris$Petal.Length, iris$Petal.Width) # this one is significant
#' Correlation between objects
# Transposing first makes cor() compare rows (objects) instead of columns.
cm2 <- cor(t(as.matrix(select(iris, -Species))))
ggcorrplot(cm2)
#' ### Parallel coordinates plot
library(GGally)
ggparcoord(data = as_tibble(iris), columns = 1:4, groupColumn = 5)
#' Reorder the axes so that correlated features are placed next to each other
library(seriation)
# 1 - correlation serves as the distance between features.
o <- seriate(
  as.dist(1 - cor(select(iris, 1:4))),
  method = "BBURCG"
)
get_order(o)
ggparcoord(data = as_tibble(iris), columns = get_order(o), groupColumn = 5)
#' Look at https://www.r-graph-gallery.com/ for many example graphs.