update

mshin77 · Jan 16, 2024 · 38f2b84 · 38f2b84
1 parent 1ee137f
commit 38f2b84
Show file tree

Hide file tree

Showing 11 changed files with 69 additions and 70 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -17,6 +17,7 @@ Imports:
     ggraph,
     magrittr,
     numform,
+    plotly,
     purrr,
     quanteda,
     quanteda.textstats,

diff --git a/NAMESPACE b/NAMESPACE
@@ -24,6 +24,7 @@ importFrom(ggplot2,ggplot)
 importFrom(ggplot2,labs)
 importFrom(ggplot2,theme_bw)
 importFrom(magrittr,"%>%")
+importFrom(plotly,ggplotly)
 importFrom(rlang,":=")
 importFrom(rlang,.data)
 importFrom(rlang,.env)

diff --git a/R/text_mining_functions.R b/R/text_mining_functions.R
@@ -87,7 +87,10 @@ preprocess_texts <-
 #'
 #' @examples
 #' if(requireNamespace("quanteda")){
-#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
+#' dfm <- SpecialEduTech %>%
+#'        preprocess_texts(text_field = "abstract",
+#'        verbose = FALSE) %>%
+#'        quanteda::dfm()
 #' dfm %>% plot_word_frequency(n = 20)
 #' }
 #'
@@ -125,7 +128,10 @@ plot_word_frequency <-
 #'
 #' @examples
 #' if(requireNamespace("quanteda")){
-#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
+#' dfm <- SpecialEduTech %>%
+#'        preprocess_texts(text_field = "abstract",
+#'        verbose = FALSE) %>%
+#'        quanteda::dfm()
 #' dfm %>% extract_frequent_word()
 #' }
 #'
@@ -162,7 +168,10 @@ extract_frequent_word <-
 #'
 #' @examples
 #' if(requireNamespace("quanteda") && requireNamespace("tidytext")){
-#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
+#' dfm <- SpecialEduTech %>%
+#'        preprocess_texts(text_field = "abstract",
+#'        verbose = FALSE) %>%
+#'        quanteda::dfm()
 #' data <- tidytext::tidy(stm_15, document_names = rownames(dfm), log = FALSE)
 #' data %>% plot_topic_term(top_n = 10)
 #' }
@@ -241,7 +250,10 @@ plot_topic_term <-
 #'
 #' @examples
 #' if(requireNamespace("quanteda") && requireNamespace("tidytext")){
-#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
+#' dfm <- SpecialEduTech %>%
+#'        preprocess_texts(text_field = "abstract",
+#'        verbose = FALSE) %>%
+#'        quanteda::dfm()
 #' data <- tidytext::tidy(stm_15, document_names = rownames(dfm), log = FALSE)
 #' data %>% examine_top_terms(top_n = 5)
 #' }
@@ -275,15 +287,17 @@ examine_top_terms <-
 #'
 #' @param data A tidy data frame that includes per-document per-topic probabilities (gamma).
 #' @param top_n A number of highest per-document per-topic probabilities (number of top_n can be changed).
-#' @param topic_names Topic names
 #' @param ... Further arguments passed.
 #'
 #' @export
 #' @return A plotly object produced by converting a \code{ggplot2::ggplot} of \code{stm::stm} and \code{tidytext::tidy} output with \code{plotly::ggplotly}.
 #'
 #' @examples
 #' if(requireNamespace("quanteda") && requireNamespace("tidytext")){
-#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
+#' dfm <- SpecialEduTech %>%
+#'        preprocess_texts(text_field = "abstract",
+#'        verbose = FALSE) %>%
+#'        quanteda::dfm()
 #' data <- tidytext::tidy(stm_15, matrix = "gamma", document_names = rownames(dfm), log = FALSE)
 #' data %>% topic_probability_plot(top_n = 15)
 #' }
@@ -292,9 +306,10 @@ examine_top_terms <-
 #' @import ggplot2
 #' @importFrom magrittr %>%
 #' @importFrom stats reorder
+#' @importFrom plotly ggplotly
 #'
 topic_probability_plot <-
-  function(data, top_n, topic_names = NULL, ...) {
+  function(data, top_n, ...) {
 
     gamma_terms <- data %>%
       group_by(topic) %>%
@@ -303,29 +318,6 @@ topic_probability_plot <-
       mutate(topic = reorder(topic, gamma))
 
     topic_by_prevalence_plot <- gamma_terms %>%
-      top_n(top_n, gamma) %>%
-      mutate(tt = as.numeric(topic)) %>%
-      mutate(ord = topic) %>%
-      mutate(topic = paste('Topic', topic)) %>%  arrange(ord)
-
-    levelt = paste("Topic", topic_by_prevalence_plot$ord) %>% unique()
-
-    topic_by_prevalence_plot$topic = factor(topic_by_prevalence_plot$topic,
-                                            levels = levelt)
-    if (!is.null(topic_names)) {
-      reft  = 1:length(topic_by_prevalence_plot$tt)
-      topic_by_prevalence_plot$topic =
-        topic_names[reft]
-      topic_by_prevalence_plot <-
-        topic_by_prevalence_plot %>%
-        mutate(topic = as.character(topic)) %>%
-        mutate(topic = ifelse(!is.na(topic), topic, paste('Topic', tt)))
-      topic_by_prevalence_plot$topic =
-        factor(topic_by_prevalence_plot$topic,
-               levels = topic_by_prevalence_plot$topic)
-    }
-
-    topic_by_prevalence_plot_output <- topic_by_prevalence_plot %>%
       ggplot(aes(topic, gamma, fill = topic)) +
       geom_col(alpha = 0.8) +
       coord_flip() +
@@ -347,6 +339,8 @@ topic_probability_plot <-
         axis.title.y = element_text(margin = margin(r = 9))
       )
 
+    topic_by_prevalence_plot_output <- topic_by_prevalence_plot %>% ggplotly()
+
         return(topic_by_prevalence_plot_output)
     }
 
@@ -360,7 +354,6 @@ topic_probability_plot <-
 #'
 #' @param data A tidy data frame that includes per-document per-topic probabilities (gamma).
 #' @param top_n A number of highest per-document per-topic probabilities (number of top_n can be changed).
-#' @param topic_names Topic names
 #' @param ... Further arguments passed.
 #'
 #' @export
@@ -369,7 +362,10 @@ topic_probability_plot <-
 #'
 #' @examples
 #' if(requireNamespace("quanteda") && requireNamespace("tidytext")){
-#' dfm <- SpecialEduTech %>% preprocess_texts(text_field = "abstract") %>% quanteda::dfm()
+#' dfm <- SpecialEduTech %>%
+#'        preprocess_texts(text_field = "abstract",
+#'        verbose = FALSE) %>%
+#'        quanteda::dfm()
 #' data <- tidytext::tidy(stm_15, matrix = "gamma", document_names = rownames(dfm), log = FALSE)
 #' data %>% topic_probability_table(top_n = 15)
 #' }
@@ -380,7 +376,7 @@ topic_probability_plot <-
 #' @importFrom stats reorder
 #'
 topic_probability_table <-
-    function(data, top_n, topic_names = NULL, ...) {
+    function(data, top_n, ...) {
 
       gamma_terms <- data %>%
         group_by(topic) %>%

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -1,4 +1,4 @@
-url: https://mshin77.github.io/TextAnalysisR/
+url: ~
 template:
   bootstrap: 5
 
diff --git a/man/examine_top_terms.Rd b/man/examine_top_terms.Rd
diff --git a/man/extract_frequent_word.Rd b/man/extract_frequent_word.Rd
diff --git a/man/plot_topic_term.Rd b/man/plot_topic_term.Rd
diff --git a/man/plot_word_frequency.Rd b/man/plot_word_frequency.Rd
diff --git a/man/topic_probability_plot.Rd b/man/topic_probability_plot.Rd
diff --git a/man/topic_probability_table.Rd b/man/topic_probability_table.Rd
diff --git a/vignettes/Text-Analysis.Rmd b/vignettes/Text-Analysis.Rmd
@@ -22,94 +22,81 @@ install.packages("devtools")
 devtools::install_github("mshin77/TextAnalysisR")
 ```
 
-## Browse the interative Shiny app
-
-Launch and browse the TextAnalysisR app:
+## Launch and Browse the Shiny App
 
 ```{r, message=FALSE, eval=FALSE}
 library(TextAnalysisR)
+
 TextAnalysisR.app()
 ```
 
 ### Preprocess Text Data
 
-Preprocess text data using the `preprocess_texts` function:
-
 ```{r, message=FALSE, eval=FALSE}
 data <- TextAnalysisR::SpecialEduTech 
 
-# Preprocess text data
 preprocessed_data <- preprocess_texts(data, text_field = "united_texts")
 ```
 
 ### Plot Word Frequency
 
-Use the `plot_word_frequency` function to plot word frequency results:
-
 ```{r, message=FALSE, eval=FALSE}
 # data is a document-feature matrix (dfm) object created with the quanteda package.
+# Plot word frequency for the top 20 terms.
 
-# Plot word frequency for the top 20 terms
 word_freq_plot <- plot_word_frequency(data, n = 20)
 print(word_freq_plot)
 ```
 
 ### Extract Frequently Observed Words
 
-Use the `extract_frequent_word` function to extract frequently observed top words:
-
 ```{r, message=FALSE, eval=FALSE}
 # data is a document-feature matrix (dfm) object created with the quanteda package.
+# Extract the top 20 frequent words.
 
-# Extract the top 20 frequent words
 top_words <- extract_frequent_word(data, n = 20)
 print(top_words)
 ```
 
 ### Plot Topic Per-Term Per-Topic Probabilities
 
-Use the `plot_topic_term` function to visualize topic per-term per-topic probabilities:
-
 ```{r, message=FALSE, eval=FALSE}
 # data is a tidy data frame that includes per-term per-topic probabilities (beta).
+# Plot per-term per-topic probabilities for the top 10 terms.
 
-# Plot per-term per-topic probabilities for the top 10 terms
 topic_term_plot <- plot_topic_term(data, top_n = 10)
 print(topic_term_plot)
 ```
 
 ### Examine Highest Per-Term Per-Topic Probabilities
 
-Use the `examine_top_terms` function to examine the highest per-term per-topic probabilities:
-
 ```{r, message=FALSE, eval=FALSE}
 # data is a tidy data frame that includes per-term per-topic probabilities (beta).
+# Examine the top 5 terms with the highest per-term per-topic probabilities.
+# Number of top_n can be changed.
 
-# Examine the top 5 terms with the highest per-term per-topic probabilities (number of top_n can be changed).
 top_terms <- examine_top_terms(data, top_n = 5)
 print(top_terms)
 ```
 
 ### Plot Per-Document Per-Topic Probabilities
 
-use the `topic_probability_plot` function to visualize per-document per-topic probabilities:
-
 ```{r, message=FALSE, eval=FALSE}
 # data is a tidy data frame that includes per-document per-topic probabilities (gamma).
+# Plot per-document per-topic probabilities for the top 15 topics.
+# Number of top_n can be changed.
 
-# Plot per-document per-topic probabilities for the top 15 topics (number of top_n can be changed)
 topic_prob_plot <- topic_probability_plot(data, top_n = 15)
 print(topic_prob_plot)
 ```
 
 ### Visualize a Table for Per-Document Per-Topic Probabilities
 
-Use the `topic_probability_table` function to create a table of per-document per-topic probabilities:
-
 ```{r, message=FALSE, eval=FALSE}
 # data is a tidy data frame that includes per-document per-topic probabilities (gamma).
+# Create a table of per-document per-topic probabilities for the top 15 topics.
+# Number of top_n can be changed.
 
-# Create a table of per-document per-topic probabilities for the top 15 topics (number of top_n can be changed)
 topic_prob_table <- topic_probability_table(gamma_td, top_n = 15)
 print(topic_prob_table)
 ```