install.packages("tm")
install.packages("ggplot2")
install.packages("wordcloud")
install.packages("hash")
library(tm)
library(ggplot2)
## corpus of 50 Reuters documents; data() loads 'acq' into the workspace
## (assigning its return value would only capture the dataset name)
data("acq")
########################################
## Q(a) try the functions from the lecture
inspect(acq)
## the length of a specific document (printing shows its character count)
test11 <- acq[[1]]
test11
## document-term matrix: reports sparsity and the maximal term length
ACQdtm <- DocumentTermMatrix(acq)
ACQdtm
## inspect part of the matrix
inspect(ACQdtm[1:15, 1:6])
## term frequencies of one document
test1tf <- termFreq(test11)
test1tf
## convert the term frequencies to a data frame
test1df <- as.data.frame(test1tf)
test1df
## Convert the corpus to lower case
ACQlow <- tm_map(acq, content_transformer(tolower))
ACQlow
## remove anything other than English letters or spaces
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
ACQcl <- tm_map(ACQlow, content_transformer(removeNumPunct))
## remove stop words from the corpus
myStopword <- stopwords('english')
ACQstop <- tm_map(ACQcl, removeWords, myStopword)
inspect(ACQstop[1:2])
## find terms with a frequency of 5 or more
ACQtdm2 <- TermDocumentMatrix(ACQstop, control = list(wordLengths = c(1, Inf)))
ACQtdm2
freq.terms <- findFreqTerms(ACQtdm2, lowfreq = 5)
freq.terms
## find words associated with "states"
findAssocs(ACQtdm2, "states", 0.25)
## overall term frequencies across the corpus
term.freq <- rowSums(as.matrix(ACQtdm2))
term.freq <- subset(term.freq, term.freq >= 5)
df <- data.frame(term = names(term.freq), freq = term.freq)
term.freq
df
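## optional visualization (a minimal sketch): ggplot2 is loaded above, so
## draw a bar chart of the terms occurring 5 or more times, using the
## 'df' data frame built just above
ggplot(df, aes(x = reorder(term, freq), y = freq)) +
  geom_col() +
  coord_flip() +
  labs(x = "Term", y = "Frequency")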
###########################################
## Q(b) use inspect
## the 15 largest documents (document number : length):
## 50:1068, 47:3013, 44:1022, 42:1607, 36:1043, 34:1465, 29:3109, 25:3516, 22:1873, 20:1009, 19:2457,
## 18:871, 7:3635, 4:2308, 1:1287
inspect(acq)
############################################
## Q(c)
## dendrogram
tdm2 <- removeSparseTerms(ACQtdm2, sparse = 0.50)
tdm2
dd <- dist(scale(tdm2), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
plot(hc)
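## optional (a minimal sketch): cut the dendrogram into k clusters and
## outline them on the plot; k = 3 is an arbitrary choice here
groups <- cutree(hc, k = 3)
groups
rect.hclust(hc, k = 3)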
## WordCloud
m1 <- as.matrix(tdm2)
word.freq <- sort(rowSums(m1), decreasing = TRUE)
word.freq
library(wordcloud)
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:4)]
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3, random.order = FALSE, colors = pal)
######################################################
install.packages("textreuse")
install.packages("wordnet")
install.packages("zipfR")
install.packages("tidyverse")
install.packages("tokenizers")
## see the full content of a document
as.character(acq[[7]])
## Q(d)
library(textreuse)
library(tidyverse)
library(tokenizers)
## get one of the 15 largest documents
docI <- acq[[7]]
charDoc <- as.character(docI)
# print the longest word in the file
# (the tokenizers functions return a list, so take [[1]] to get the word vector)
max_word_len <- 0
max_word <- ""
for (word in tokenize_words(charDoc)[[1]]) {
  if (nchar(word) > max_word_len) {
    max_word <- word
    max_word_len <- nchar(word)
  }
}
print(max_word)
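## the same result without an explicit loop (a minimal sketch)
words <- tokenize_words(charDoc)[[1]]
words[which.max(nchar(words))]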
# print the longest sentence in the file, measured in words
max_sentence_len <- 0
max_sentence <- ""
for (sentence in tokenize_sentences(charDoc)[[1]]) {
  count <- length(tokenize_words(sentence)[[1]])
  if (count > max_sentence_len) {
    max_sentence <- sentence
    max_sentence_len <- count
  }
}
print(max_sentence)
print(max_sentence_len)
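## the same result without explicit loops (a minimal sketch): word counts
## per sentence, then pick the maximum
sents <- tokenize_sentences(charDoc)[[1]]
word_counts <- sapply(tokenize_words(sents), length)
sents[which.max(word_counts)]
max(word_counts)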
#######################################################
## Q(e)
## build a table showing the length of the longest sentence in each file
length_array <- c(15, 16) ## fill in the lengths found above
length_data <- data.frame(len = length_array)
mytable <- cbind(sites = c("file 1", "file 2"), length_data) ## change the file names here
rownames(mytable) <- c("No1", "No2")
mytable
#############################################################
## Q(f)
## remove numbers and punctuation, reusing removeNumPunct from above
fileNoPun <- tm_map(acq, content_transformer(removeNumPunct))
DocINoPun <- fileNoPun[[7]]
tokenize_sentences(as.character(DocINoPun))
#############################################################
## Q(g)
## tokenize each sentence into words, as a first step toward tagging
## the part of speech of every word with wordnet
library(wordnet)
docI <- acq[[7]]
charDoc <- as.character(docI)
sentences <- tokenize_sentences(charDoc)[[1]]
sentences_words <- sapply(sentences, tokenize_words)
sentences
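## a minimal sketch of a part-of-speech lookup with wordnet; this assumes
## a local WordNet installation that initDict() can locate (otherwise set
## the WNHOME environment variable first)
if (initDict()) {
  word <- sentences_words[[1]][1] ## first word of the first sentence
  filter <- getTermFilter("ExactMatchFilter", word, TRUE)
  for (pos in c("NOUN", "VERB", "ADJECTIVE", "ADVERB")) {
    if (length(getIndexTerms(pos, 1, filter)) > 0) {
      print(paste(word, "can be a", pos))
    }
  }
}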
##############################################################
## Q(h)
## print word frequencies
library(zipfR)
testFre <- termFreq(acq[[7]])
rt_pos <- as.data.frame(testFre)
freqs <- rt_pos[, 1] ## frequency column; avoid calling it 'Vm', which would shadow a zipfR function
testFre
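## zipfR is loaded but not exercised above; as a minimal sketch, draw a
## Zipf-style rank-frequency plot of the same counts in base graphics
plot(sort(as.numeric(testFre), decreasing = TRUE), log = "xy",
     xlab = "rank", ylab = "frequency")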
####################################################
## search for a given word across all documents in the corpus
library(tm)
library(textreuse)
target = "capital"
file_index = 1
find = 0
result_index = 1
for(i in 1:50){
File <- acq[[file_index]]
charFile <- as.character(File)
line_index = 1
for(line in tokenize_sentences(charFile)){
word_index = 1
for(word in tokenize_words(line)){
if(word == target){
print(paste(paste("No.", as.character(result_index)), "result"))
print(paste(paste("No.", as.character(file_index)), "file"))
print(paste(paste("No.", as.character(line_index)), "line"))
print(paste(paste("No.", as.character(word_index)), "word"))
find = 1
result_index = result_index + 1
print('---------------')
}
word_index = word_index + 1
}
line_index = line_index + 1
}
file_index = file_index + 1
}
if(find == 0){
print("no such word!")
}
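## a quicker first pass (a minimal sketch): list only the documents that
## contain the target word at all, before locating exact positions
hits <- which(sapply(seq_along(acq), function(i)
  target %in% tokenize_words(as.character(acq[[i]]))[[1]]))
hits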