# ***********************************************
# ***********************************************
# Author: Benjamin Tovar
# Date: April 25, 2015
# Post: http://btovar.com/2015/04/text-mining-in-r/
# ***********************************************
# ***********************************************
library(reshape)
library(ggplot2)
library(tm)
library(wordcloud)
# ****************************
# load data and labels
# ***************************
# set input files
dataset_file <- "dataset.txt"
labels_file <- "labels.txt"
# read file
dataset <- read.delim(dataset_file,header=FALSE)
# read labels
dataset_labels <- read.delim(labels_file,header=FALSE)
dataset_labels <- dataset_labels[,1]
# prefix each label with "class_"
dataset_labels_p <- paste("class",dataset_labels,sep="_")
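# e.g. paste("class", 3, sep="_") returns "class_3",
# so the raw labels 1..8 become "class_1" .. "class_8"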
# ****************************
# print and plot the number
# of samples for each class
# ***************************
# > table(dataset_labels_p)
# dataset_labels_p
# class_1 class_2 class_3 class_4 class_5 class_6 class_7 class_8
# 2840 1596 251 108 41 253 190 206
pdf("barplot_sample_number_per_class.pdf",height=6,width=8)
ggplot(as.data.frame(table(dataset_labels_p))) +
  aes(x=dataset_labels_p,y=Freq) +
  geom_bar(stat="identity",aes(fill=dataset_labels_p),alpha=0.7) +
  theme(legend.position="none") +
  ylim(0,3000) +
  labs(title="Sample number for each document class",
       x="Class",y="Number of samples")
dev.off()
# ****************************
# Pre-process data
# ***************************
# get unique class name list
unique_labels <- unique(dataset_labels_p)
# merge documents that match
# certain class into a list object
dataset_s <- sapply(unique_labels,function(label) list( dataset[dataset_labels_p %in% label,1] ) )
# double-check that every document was assigned to one of the list elements
as.data.frame(lapply(dataset_s,length))
# class_1 class_2 class_3 class_4 class_5 class_6 class_7 class_8
# 1 2840 1596 251 108 41 253 190 206
# ****************************
# Build a document corpus
# for text mining
# ***************************
# convert each list content into a corpus
dataset_corpus <- lapply(dataset_s, function(x) Corpus(VectorSource( toString(x) )))
# merge all documents into one single corpus
dataset_corpus_all <- dataset_corpus[[1]]
for (i in 2:length(unique_labels)) {
dataset_corpus_all <- c(dataset_corpus_all,dataset_corpus[[i]])
}
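# note: the loop above should be equivalent to the one-liner
# dataset_corpus_all <- do.call(c, dataset_corpus)
# since c() is the same method used here to concatenate the corpora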
# > dataset_corpus_all
# <<VCorpus (documents: 8, metadata (corpus/indexed): 0/0)>>
# > summary(dataset_corpus_all)
# Length Class Mode
# 1 2 PlainTextDocument list
# 1 2 PlainTextDocument list
# 1 2 PlainTextDocument list
# 1 2 PlainTextDocument list
# 1 2 PlainTextDocument list
# 1 2 PlainTextDocument list
# 1 2 PlainTextDocument list
# 1 2 PlainTextDocument list
# ****************************
# text pre-processing
# ***************************
# remove punctuation
dataset_corpus_all <- tm_map(dataset_corpus_all, removePunctuation)
# remove numbers
dataset_corpus_all <- tm_map(dataset_corpus_all, removeNumbers)
# remove stopwords
dataset_corpus_all <- tm_map(dataset_corpus_all, removeWords, stopwords("english"))
# set a list of words to remove:
# these words appear many times in
# all documents but say little about
# the document class.
# feel free to comment out the next lines
words_to_remove <- c("said","from","what","told","over","more","other","have","last",
"with","this","that","such","when","been","says","will","also",
"where","why","would","today")
dataset_corpus_all <- tm_map(dataset_corpus_all, removeWords, words_to_remove)
# ****************************
# compute term matrices M[i,j]
# where:
# M[i,j] <- frequency of word i in document class j
# ***************************
# compute term matrix
document_tm <- TermDocumentMatrix(dataset_corpus_all)
# convert to matrix class
document_tm_mat <- as.matrix(document_tm)
colnames(document_tm_mat) <- unique_labels
# compute dimensions of matrix
dim(document_tm_mat)
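# optional sanity check (not in the original post): peek at the 10 overall
# most frequent terms across all class documents, i.e. the largest row sums
head(sort(rowSums(document_tm_mat), decreasing=TRUE), 10)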
# ****************************
# For quality control, remove
# sparse terms
# ***************************
# FROM: ?removeSparseTerms
#
# Usage:
#
# removeSparseTerms(x, sparse)
#
# Arguments:
#
# x: A ‘DocumentTermMatrix’ or a ‘TermDocumentMatrix’.
#
# sparse: A numeric for the maximal allowed sparsity, in the range from
# greater than zero to less than one.
#
# Value:
#
# A term-document matrix where those terms from ‘x’ are removed
# which have at least a ‘sparse’ percentage of empty (i.e., terms
# occurring 0 times in a document) elements. I.e., the resulting
# matrix contains only terms with a sparse factor of less than
# ‘sparse’.
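# In practical terms, with our 8 class documents: a term that occurs in
# only 1 of the 8 classes has a sparsity of 7/8 = 0.875 >= 0.8 and is
# removed, while a term present in 2 or more classes (sparsity <= 6/8 = 0.75)
# is kept.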
document_tm_clean <- removeSparseTerms(document_tm, 0.8)
# convert to matrix class
document_tm_clean_mat <- as.matrix(document_tm_clean)
colnames(document_tm_clean_mat) <- unique_labels
# compute dimensions of matrix
dim(document_tm_clean_mat)
# ****************************
# For extra quality control
# remove words in term matrix
# with length < 4
# ***************************
# compute row index
index <- as.logical(sapply(rownames(document_tm_clean_mat), function(x) (nchar(x)>3) ))
# extract new term matrix
document_tm_clean_mat_s <- document_tm_clean_mat[index,]
# compute dimensions of matrix
dim(document_tm_clean_mat_s)
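# optional check (not in the original post): how many short terms were dropped
nrow(document_tm_clean_mat) - nrow(document_tm_clean_mat_s)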
# ****************************
# compute comparison
# word clouds
# ***************************
# FROM: ?comparison.cloud
#
# Let p_{i,j} be the rate at which word i occurs in document j, and
# p_j be the average across documents (sum_i p_{i,j} / ndocs). The size
# of each word is mapped to its maximum deviation (
# max_i(p_{i,j}-p_j) ), and its angular position is determined by
# the document where that maximum occurs.
# top 500 discriminant words
pdf("comparison_cloud_top_500_words.pdf",height=8,width=8)
comparison.cloud(document_tm_clean_mat_s,
                 max.words=500,
                 random.order=FALSE,
                 scale=c(4,0.4),
                 title.size=1.4)
dev.off()
# top 2000 discriminant words
pdf("comparison_cloud_top_2000_words.pdf",height=8,width=8)
comparison.cloud(document_tm_clean_mat_s,
                 max.words=2000,
                 random.order=FALSE,
                 scale=c(4,0.4),
                 title.size=1.4)
dev.off()
# ****************************
# compute commonality
# word clouds
# ***************************
# FROM: ?commonality.cloud
# Plot a cloud of words shared across documents
# top 2000 common words across classes
pdf("commonality_cloud_top_2000_words.pdf",height=5,width=5)
commonality.cloud(document_tm_clean_mat_s,
                  max.words=2000,
                  random.order=FALSE)
dev.off()
# ****************************
# Plot adjusted word frequency
# for each document class
# ***************************
# NOTE: "adjusted" means that a word with
# an adjusted value of 1.0 is the most
# frequent word in its document class;
# the remaining values are scaled relative
# to the frequency of that top word.
# adjust by max value found for each document class
document_tm_clean_mat_s_norm <- apply(document_tm_clean_mat_s,2,function(col) col/max(col) )
# > colSums(document_tm_clean_mat_s_norm)
# class_1 class_2 class_3 class_4 class_5 class_6 class_7 class_8
# 24.21316 22.87277 30.76670 29.17125 27.56000 27.43659 32.27465 33.85191
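# toy illustration of the adjustment (hypothetical counts, not from the data):
# raw frequencies 2, 5 and 10 in one class column become 0.2, 0.5 and 1.0
# after dividing by the column maximum
apply(cbind(example_class = c(2, 5, 10)), 2, function(col) col / max(col))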
dataset_wf <- melt(document_tm_clean_mat_s_norm)
dataset_wf <- cbind(dataset_wf,
    # add a colour category depending on the score
    # (the final "lol" fallback is never reached since all values are <= 1)
    category=ifelse(dataset_wf$value<=0.25,"0.25",
             ifelse(dataset_wf$value<=0.5, "0.5",
             ifelse(dataset_wf$value<=0.75, "0.75",
             ifelse(dataset_wf$value<=1, "1.0","lol"))))
)
# > dataset_wf[1:5,]
# Terms Docs value category
# 1 abandon class_1 0.000000000 0.25
# 2 abandoned class_1 0.000483910 0.25
# 3 abbey class_1 0.000483910 0.25
# 4 abdel class_1 0.000000000 0.25
# 5 abdul class_1 0.000241955 0.25
# ***************************
# count the number of words
# per category
# ***************************
# > table(dataset_wf$category)
# 0.25 0.5 0.75 1.0
# 58108 74 16 18
# Given that we have 58108 words in the 0.25 category,
# it makes sense to randomly pick only a sample
# of that category to keep the visualization readable
# extract all non 0.25 items
index <- !dataset_wf$category %in% "0.25"
dataset_wf2 <- dataset_wf[index,]
# rbind with 500 samples of 0.25 category items
index <- sample(rownames(dataset_wf[dataset_wf$category %in% "0.25",]),
500,replace=FALSE)
dataset_wf2 <- rbind(dataset_wf2,dataset_wf[index,])
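# note: sample() above draws a random subset, so the plotted 0.25-category
# words will differ between runs; calling set.seed() before the sample()
# call (not done in the original post) would make the figure reproducible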
# > table(dataset_wf2$category)
# 0.25 0.5 0.75 1.0
# 500 74 16 18
pdf("adjusted_wf.pdf",width=8,height=6)
ggplot(dataset_wf2) +
  aes(x=Docs,y=value) +
  # size is set outside aes() as a fixed text size rather than a mapped aesthetic
  geom_text(aes(label=Terms,colour=category),size=3) +
  labs(title="Adjusted frequency of words for each dataset",
       x="Document class",y="Adjusted word frequency") +
  theme_minimal() +
  theme(legend.position="none")
dev.off()