Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
mshin77 committed Jan 16, 2024
1 parent 1ee137f commit 38f2b84
Show file tree
Hide file tree
Showing 11 changed files with 69 additions and 70 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ Imports:
ggraph,
magrittr,
numform,
plotly,
purrr,
quanteda,
quanteda.textstats,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ importFrom(ggplot2,ggplot)
importFrom(ggplot2,labs)
importFrom(ggplot2,theme_bw)
importFrom(magrittr,"%>%")
importFrom(plotly,ggplotly)
importFrom(rlang,":=")
importFrom(rlang,.data)
importFrom(rlang,.env)
Expand Down
62 changes: 29 additions & 33 deletions R/text_mining_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,10 @@ preprocess_texts <-
#'
#' @examples
#' if(requireNamespace("quanteda")){
#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
#' dfm <- SpecialEduTech %>%
#' preprocess_texts(text_field = "abstract",
#' verbose = FALSE) %>%
#' quanteda::dfm()
#' dfm %>% plot_word_frequency(n = 20)
#' }
#'
Expand Down Expand Up @@ -125,7 +128,10 @@ plot_word_frequency <-
#'
#' @examples
#' if(requireNamespace("quanteda")){
#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
#' dfm <- SpecialEduTech %>%
#' preprocess_texts(text_field = "abstract",
#' verbose = FALSE) %>%
#' quanteda::dfm()
#' dfm %>% extract_frequent_word()
#' }
#'
Expand Down Expand Up @@ -162,7 +168,10 @@ extract_frequent_word <-
#'
#' @examples
#' if(requireNamespace("quanteda") && requireNamespace("tidytext")){
#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
#' dfm <- SpecialEduTech %>%
#' preprocess_texts(text_field = "abstract",
#' verbose = FALSE) %>%
#' quanteda::dfm()
#' data <- tidytext::tidy(stm_15, document_names = rownames(dfm), log = FALSE)
#' data %>% plot_topic_term(top_n = 10)
#' }
Expand Down Expand Up @@ -241,7 +250,10 @@ plot_topic_term <-
#'
#' @examples
#' if(requireNamespace("quanteda") && requireNamespace("tidytext")){
#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
#' dfm <- SpecialEduTech %>%
#' preprocess_texts(text_field = "abstract",
#' verbose = FALSE) %>%
#' quanteda::dfm()
#' data <- tidytext::tidy(stm_15, document_names = rownames(dfm), log = FALSE)
#' data %>% examine_top_terms(top_n = 5)
#' }
Expand Down Expand Up @@ -275,15 +287,17 @@ examine_top_terms <-
#'
#' @param data A tidy data frame that includes per-document per-topic probabilities (gamma).
#' @param top_n A number of highest per-document per-topic probabilities (number of top_n can be changed).
#' @param topic_names Topic names
#' @param ... Further arguments passed.
#'
#' @export
#' @return A plotly object created by \code{plotly::ggplotly} from a \code{ggplot2::ggplot} plot of output from \code{stm::stm} and \code{tidytext::tidy}.
#'
#' @examples
#' if(requireNamespace("quanteda") && requireNamespace("tidytext")){
#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
#' dfm <- SpecialEduTech %>%
#' preprocess_texts(text_field = "abstract",
#' verbose = FALSE) %>%
#' quanteda::dfm()
#' data <- tidytext::tidy(stm_15, matrix = "gamma", document_names = rownames(dfm), log = FALSE)
#' data %>% topic_probability_plot(top_n = 15)
#' }
Expand All @@ -292,9 +306,10 @@ examine_top_terms <-
#' @import ggplot2
#' @importFrom magrittr %>%
#' @importFrom stats reorder
#' @importFrom plotly ggplotly
#'
topic_probability_plot <-
function(data, top_n, topic_names = NULL, ...) {
function(data, top_n, ...) {

gamma_terms <- data %>%
group_by(topic) %>%
Expand All @@ -303,29 +318,6 @@ topic_probability_plot <-
mutate(topic = reorder(topic, gamma))

topic_by_prevalence_plot <- gamma_terms %>%
top_n(top_n, gamma) %>%
mutate(tt = as.numeric(topic)) %>%
mutate(ord = topic) %>%
mutate(topic = paste('Topic', topic)) %>% arrange(ord)

levelt = paste("Topic", topic_by_prevalence_plot$ord) %>% unique()

topic_by_prevalence_plot$topic = factor(topic_by_prevalence_plot$topic,
levels = levelt)
if (!is.null(topic_names)) {
reft = 1:length(topic_by_prevalence_plot$tt)
topic_by_prevalence_plot$topic =
topic_names[reft]
topic_by_prevalence_plot <-
topic_by_prevalence_plot %>%
mutate(topic = as.character(topic)) %>%
mutate(topic = ifelse(!is.na(topic), topic, paste('Topic', tt)))
topic_by_prevalence_plot$topic =
factor(topic_by_prevalence_plot$topic,
levels = topic_by_prevalence_plot$topic)
}

topic_by_prevalence_plot_output <- topic_by_prevalence_plot %>%
ggplot(aes(topic, gamma, fill = topic)) +
geom_col(alpha = 0.8) +
coord_flip() +
Expand All @@ -347,6 +339,8 @@ topic_probability_plot <-
axis.title.y = element_text(margin = margin(r = 9))
)

topic_by_prevalence_plot_output <- topic_by_prevalence_plot %>% ggplotly()

return(topic_by_prevalence_plot_output)
}

Expand All @@ -360,7 +354,6 @@ topic_probability_plot <-
#'
#' @param data A tidy data frame that includes per-document per-topic probabilities (gamma).
#' @param top_n A number of highest per-document per-topic probabilities (number of top_n can be changed).
#' @param topic_names Topic names
#' @param ... Further arguments passed.
#'
#' @export
Expand All @@ -369,7 +362,10 @@ topic_probability_plot <-
#'
#' @examples
#' if(requireNamespace("quanteda") && requireNamespace("tidytext")){
#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
#' dfm <- SpecialEduTech %>%
#' preprocess_texts(text_field = "abstract",
#' verbose = FALSE) %>%
#' quanteda::dfm()
#' data <- tidytext::tidy(stm_15, matrix = "gamma", document_names = rownames(dfm), log = FALSE)
#' data %>% topic_probability_table(top_n = 15)
#' }
Expand All @@ -380,7 +376,7 @@ topic_probability_plot <-
#' @importFrom stats reorder
#'
topic_probability_table <-
function(data, top_n, topic_names = NULL, ...) {
function(data, top_n, ...) {

gamma_terms <- data %>%
group_by(topic) %>%
Expand Down
2 changes: 1 addition & 1 deletion _pkgdown.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
url: https://mshin77.github.io/TextAnalysisR/
url: ~
template:
bootstrap: 5

5 changes: 4 additions & 1 deletion man/examine_top_terms.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/extract_frequent_word.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/plot_topic_term.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion man/plot_word_frequency.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions man/topic_probability_plot.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 5 additions & 4 deletions man/topic_probability_table.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 11 additions & 24 deletions vignettes/Text-Analysis.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -22,94 +22,81 @@ install.packages("devtools")
devtools::install_github("mshin77/TextAnalysisR")
```

## Browse the interactive Shiny app

Launch and browse the TextAnalysisR app:
## Launch and Browse the Shiny app

```{r, message=FALSE, eval=FALSE}
library(TextAnalysisR)
TextAnalysisR.app()
```

### Preprocess Text Data

Preprocess text data using the `preprocess_texts` function:

```{r, message=FALSE, eval=FALSE}
data <- TextAnalysisR::SpecialEduTech
# Preprocess text data
preprocessed_data <- preprocess_texts(data, text_field = "united_texts")
```

### Plot Word Frequency

Use the `plot_word_frequency` function to plot word frequency results:

```{r, message=FALSE, eval=FALSE}
# data is a document-feature matrix (dfm) object through the quanteda package.
# Plot word frequency for the top 20 terms.
# Plot word frequency for the top 20 terms
word_freq_plot <- plot_word_frequency(data, n = 20)
print(word_freq_plot)
```

### Extract Frequently Observed Words

Use the `extract_frequent_word` function to extract frequently observed top words:

```{r, message=FALSE, eval=FALSE}
# data is a document-feature matrix (dfm) object through the quanteda package.
# Extract the top 20 frequent words.
# Extract the top 20 frequent words
top_words <- extract_frequent_word(data, n = 20)
print(top_words)
```

### Plot Topic Per-Term Per-Topic Probabilities

Use the `plot_topic_term` function to visualize topic per-term per-topic probabilities:

```{r, message=FALSE, eval=FALSE}
# data is a tidy data frame that includes per-term per-topic probabilities (beta).
# Plot per-term per-topic probabilities for the top 10 terms.
# Plot per-term per-topic probabilities for the top 10 terms
topic_term_plot <- plot_topic_term(data, top_n = 10)
print(topic_term_plot)
```

### Examine Highest Per-Term Per-Topic Probabilities

Use the `examine_top_terms` function to examine the highest per-term per-topic probabilities:

```{r, message=FALSE, eval=FALSE}
# data is a tidy data frame that includes per-term per-topic probabilities (beta).
# Examine the top 5 terms with the highest per-term per-topic probabilities.
# Number of top_n can be changed.
# Examine the top 5 terms with the highest per-term per-topic probabilities (number of top_n can be changed).
top_terms <- examine_top_terms(data, top_n = 5)
print(top_terms)
```

### Plot Per-Document Per-Topic Probabilities

Use the `topic_probability_plot` function to visualize per-document per-topic probabilities:

```{r, message=FALSE, eval=FALSE}
# data is a tidy data frame that includes per-document per-topic probabilities (gamma).
# Plot per-document per-topic probabilities for the top 15 topics.
# Number of top_n can be changed.
# Plot per-document per-topic probabilities for the top 15 topics (number of top_n can be changed)
topic_prob_plot <- topic_probability_plot(data, top_n = 15)
print(topic_prob_plot)
```

### Visualize a Table for Per-Document Per-Topic Probabilities

Use the `topic_probability_table` function to create a table of per-document per-topic probabilities:

```{r, message=FALSE, eval=FALSE}
# data is a tidy data frame that includes per-document per-topic probabilities (gamma).
# Create a table of per-document per-topic probabilities for the top 15 topics.
# Number of top_n can be changed.
# Create a table of per-document per-topic probabilities for the top 15 topics (number of top_n can be changed)
topic_prob_table <- topic_probability_table(gamma_td, top_n = 15)
print(topic_prob_table)
```
Expand Down

0 comments on commit 38f2b84

Please sign in to comment.