---
title: "Text analysis 10"
author: "Junyan Yao"
date: "12/17/2017"
output: html_document
---
```{r}
library(corpus)
library(Matrix)
library(tidytext)
library(ggplot2)
library(dplyr)
library(qdap)
library(readxl)   # needed for read_excel() below

setwd("/Users/YaoJunyan/Documents/Text-Analysis")
data <- read.csv("~/Documents/NYU/Fall 2017/Text Analysis Project/cpsv_text_project/chat_time_series.csv")
# office computer path:
# data <- read.csv("C:/Users/jyao/Documents/Text Analysis/chat_time_series.csv")
data <- data[, c(2, 5, 8)]  # extract the needed columns

# Subset the data by record type
chatdata <- data[which(data$type == "chat"), ]       # this is what we look at for now
problemdata <- data[which(data$type == "problem"), ]

# Load the outcome data
outcomedata <- read.csv("~/Documents/NYU/Fall 2017/Text Analysis Project/cpsv_text_project/group_outcomes.csv")
# office computer path:
# outcomedata <- read.csv("C:/Users/jyao/Documents/Text Analysis/group_outcomes.csv")

# Typo-correction word list
cwords <- read_excel("~/Desktop/Text-Analysis/cwords.xlsx")

subset1 <- outcomedata[outcomedata$group_id > 0, ]  # drop all negative group_id values
summary(subset1$delta)                              # 110 groups in this dataset

# Label each group's performance by its delta score
performance <- ifelse(subset1$delta > 0.4058, "high",
                      ifelse(subset1$delta < -0.481, "low", "in-between"))
temp22 <- cbind(subset1, performance)

# Merging drops the rows whose group_id is missing from the outcome data
merged_data <- merge(x = chatdata, y = temp22, by = "group_id")
merged_data$text <- as.character(merged_data$content)

# Load the question text and drop the trailing NA rows
ques <- read.csv("problem/df3.csv")
ques <- ques[1:113, 1:2]
```
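As a quick sanity check on the data preparation (a minimal sketch; it only inspects objects built in the chunk above):

```{r}
# How many groups fall into each performance band?
table(temp22$performance)

# How many groups survived the merge with the chat data?
length(unique(merged_data$group_id))

# A peek at the chat text that gets tokenized later
head(merged_data$text, 3)
```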
```{r}
# Term statistics for the questions, dropping stopwords and punctuation
que2 <- term_stats(ques$question, drop = stopwords_en, drop_punct = TRUE)

# Drop the leftover pieces of the HTML <p> tags
que2 <- que2[!(que2$term %in% c(">", "<", "p")), ]

# Term statistics for the chat messages
Y <- term_stats(chatdata$content, drop = stopwords_en, drop_punct = TRUE)
```
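`term_stats()` returns one row per term, sorted by frequency, so a quick `head()` shows the most common terms in each list (just an inspection sketch):

```{r}
head(que2)  # most frequent question terms
head(Y)     # most frequent chat terms
```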
Append the question term list to the chat term list:
```{r}
# One combined term list: chat terms stacked on question terms
D <- rbind(Y, que2)
```
Low-frequency words that appear in the questions:
```{r}
# Question terms that occur fewer than three times
que3 <- que2[which(que2$count < 3), ]
```
Grammar/Typo correction
```{r}
dim(cwords)
names(cwords) <- c("original", "term")

# Flag the corrected terms that also appear in the low-frequency question list
wordz <- cwords %>% left_join(que3, by = "term")
wordz$Q <- wordz$term
wordz$Q[which(wordz$count >= 1)] <- "Qword"   # replace question words with a placeholder

# Remove NA's and duplicates
wordz <- wordz[!is.na(wordz$Q), ]
wordz <- wordz[!duplicated(wordz$term), ]
```
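Before applying the substitution to the whole corpus, a toy call shows what `qdap::mgsub()` is expected to do (the sentence and the misspelling here are made up for illustration):

```{r}
# Hypothetical example: one typo mapped to its correction,
# one low-frequency question word mapped to the "Qword" placeholder
mgsub(c("triangel", "hypotenuse"), c("triangle", "Qword"),
      "the triangel has a long hypotenuse")
# expected: "the triangle has a long Qword"
```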
Replace all the Q-words and correct the typos in the corpus. This chunk does not work as written, so it is kept with `eval=FALSE`.
```{r, eval=FALSE}
merged_data$sentences <- mgsub(wordz$original, wordz$Q, merged_data$content,
                               leadspace = TRUE, trailspace = TRUE)

# Per-message term statistics, then the same substitution on each term table
T <- vector("list", length(merged_data$content))
for (i in 1:length(merged_data$content)) {
  T[[i]] <- term_stats(merged_data$content[i])
}
for (i in 1:length(merged_data$content)) {
  T[[i]][1] <- mgsub(wordz$original, wordz$Q, T[[i]][1])
}
```
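One possible workaround (a sketch of an alternative, not the original approach): apply the substitutions message by message with base `gsub()` and word boundaries, so only whole words are replaced. This assumes the entries in `wordz$original` contain no regex metacharacters:

```{r, eval=FALSE}
# Replace each misspelling (whole words only) with its correction or "Qword"
corrected <- merged_data$content
for (j in seq_len(nrow(wordz))) {
  corrected <- gsub(paste0("\\b", wordz$original[j], "\\b"),
                    wordz$Q[j], corrected)
}
merged_data$sentences <- corrected
```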
```{r}
# Count each word within each group
TermByGroup <- merged_data %>%
  unnest_tokens(word, text) %>%
  count(group_id, word, sort = TRUE) %>%
  ungroup()

# Total number of word tokens per group
tot <- TermByGroup %>%
  group_by(group_id) %>%
  summarize(total = sum(n))
TermByGroup <- left_join(TermByGroup, tot, by = "group_id")

# tf-idf, treating each group as a document
TermByGroup <- TermByGroup %>%
  bind_tf_idf(word, group_id, n)
head(TermByGroup)
```
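With tf-idf computed, a natural next step is to look at the highest-scoring terms; a minimal plotting sketch using the already-loaded ggplot2 (the cutoff of 20 terms is arbitrary):

```{r}
# The 20 terms with the highest tf-idf across all groups
TermByGroup %>%
  arrange(desc(tf_idf)) %>%
  slice(1:20) %>%
  ggplot(aes(x = reorder(word, tf_idf), y = tf_idf)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL, y = "tf-idf")
```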