Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
mshin77 committed Mar 21, 2024
1 parent a13deb6 commit 3f283a6
Show file tree
Hide file tree
Showing 14 changed files with 492 additions and 96 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- uses: r-lib/actions/setup-pandoc@v2

Expand All @@ -47,3 +47,4 @@ jobs:
- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true
build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")'
4 changes: 2 additions & 2 deletions .github/workflows/pkgdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:
permissions:
contents: write
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- uses: r-lib/actions/setup-pandoc@v2

Expand All @@ -41,7 +41,7 @@ jobs:

- name: Deploy to GitHub pages 🚀
if: github.event_name != 'pull_request'
uses: JamesIves/github-pages-deploy-action@v4.4.1
uses: JamesIves/github-pages-deploy-action@v4.5.0
with:
clean: false
branch: gh-pages
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/test-coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- uses: r-lib/actions/setup-r@v2
with:
Expand All @@ -39,12 +39,12 @@ jobs:
if: always()
run: |
## --------------------------------------------------------------------
find ${{ runner.temp }}/package -name 'testthat.Rout*' -exec cat '{}' \; || true
find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true
shell: bash

- name: Upload test results
if: failure()
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: coverage-test-failures
path: ${{ runner.temp }}/package
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,14 @@ Language: en-US
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Imports:
dplyr,
DT,
ggplot2,
ggraph,
magrittr,
numform,
purrr,
quanteda,
quanteda.textstats,
plotly,
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,5 @@ importFrom(rlang,.env)
importFrom(rlang,enquos)
importFrom(stats,reorder)
importFrom(tibble,tibble)
importFrom(tidyr,unnest)
importFrom(tidytext,reorder_within)
importFrom(tidytext,scale_x_reordered)
159 changes: 77 additions & 82 deletions R/text_mining_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,24 +114,20 @@ plot_word_frequency <-
}


# Display text mining results from the structural topic model ----

#' @title Plot topic per-term per-topic probabilities
#' @title Examine highest per-term per-topic probabilities
#'
#' @name plot_topic_term
#' @name examine_top_terms
#'
#' @description
#' Plot per-term per-topic probabilities with highest word probabilities.
#' Examine highest per-term per-topic probabilities.
#'
#' @param data A tidy data frame that includes per-term per-topic probabilities (beta).
#' @param top_n A number of highest per-term per-topic probabilities in each document (number of top_n can be changed).
#' @param ncol A number of columns in the facet plot.
#' @param topic_names (Labeled) topic names
#' @param ... Further arguments passed to \code{dplyr::group_by}.
#'
#' @export
#' @return A ggplot object output from \code{stm::stm}, \code{tidytext::tidy}, and \code{ggplot2::ggplot}.
#' The result is a ggplot object representing the topic-term plot.
#' @return A tibble (data frame) object with a list of word probabilities from \code{tidytext::tidy}.
#' The result is a data frame containing word probabilities for each topic.
#'
#' @examples
#' suppressWarnings({
Expand All @@ -140,81 +136,45 @@ plot_word_frequency <-
#' preprocess_texts(text_field = "abstract") %>%
#' quanteda::dfm()
#' data <- tidytext::tidy(stm_15, document_names = rownames(dfm), log = FALSE)
#' data %>% plot_topic_term(top_n = 2, ncol = 3)
#' data %>% examine_top_terms(top_n = 5) %>%
#' dplyr::mutate_if(is.numeric, ~ round(., 3)) %>%
#' DT::datatable(rownames = FALSE)
#' }
#' })
#'
#' @import dplyr
#' @import ggplot2
#' @importFrom magrittr %>%
#' @importFrom rlang := enquos
#' @importFrom tidytext scale_x_reordered reorder_within
#'
plot_topic_term <-
function(data, top_n, ncol = 3, topic_names = NULL, ...) {

topic_term_plot <- data %>%
group_by(topic, ...) %>%
top_n(top_n, beta) %>%
ungroup() %>%
mutate(
ord = factor(topic, levels = c(min(topic): max(topic))),
tt = as.numeric(topic),
topic = paste("Topic", topic),
term = reorder_within(term, beta, topic)) %>%
arrange(ord)
levelt = paste("Topic", topic_term_plot$ord) %>% unique()
topic_term_plot$topic = factor(topic_term_plot$topic,
levels = levelt)
if(!is.null(topic_names)){
topic_term_plot$topic = topic_names[topic_term_plot$tt]
topic_term_plot <- topic_term_plot %>%
mutate(topic = as.character(topic)) %>%
mutate(topic = ifelse(!is.na(topic), topic, paste('Topic',tt)))
topic_term_plot$topic =
factor(topic_term_plot$topic, levels = topic_term_plot$topic %>% unique())
}
topic_term_plot$tt = NULL
topic_term_plot <- topic_term_plot %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE, alpha = 0.8) +
facet_wrap(~ topic, scales = "free", ncol = ncol) +
scale_x_reordered() +
scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 3)) +
coord_flip() +
xlab("") +
ylab("Word probability") +
theme_minimal(base_size = 12) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(color = "#3B3B3B", size = 0.3),
axis.ticks = element_line(color = "#3B3B3B", size = 0.3),
strip.text.x = element_text(size = 12, color = "#3B3B3B"),
axis.text.x = element_text(size = 12, color = "#3B3B3B"),
axis.text.y = element_text(size = 12, color = "#3B3B3B"),
axis.title = element_text(size = 12, color = "#3B3B3B"),
axis.title.x = element_text(margin = margin(t = 7)),
axis.title.y = element_text(margin = margin(r = 7)))

return(topic_term_plot)
}
examine_top_terms <-

function(data, top_n, ...) {
topic_term <- data %>%
group_by(topic, ...) %>%
top_n(top_n, beta) %>%
ungroup()

#' @title Examine highest per-term per-topic probabilities
return(topic_term)
}


# Display text mining results from the structural topic model ----

#' @title Plot topic per-term per-topic probabilities
#'
#' @name examine_top_terms
#' @name plot_topic_term
#'
#' @description
#' Examine highest per-term per-topic probabilities.
#' Plot per-term per-topic probabilities with highest word probabilities.
#'
#' @param data A tidy data frame that includes per-term per-topic probabilities (beta).
#' @param top_n A number of highest per-term per-topic probabilities in each document (number of top_n can be changed).
#' @param ncol A number of columns in the facet plot.
#' @param topic_names (Labeled) topic names
#' @param ... Further arguments passed to \code{dplyr::group_by}.
#'
#' @export
#' @return A tibble (data frame) object with a list of word probabilities from \code{tidytext::tidy}.
#' The result is a data frame containing word probabilities for each topic.
#' @return A ggplot object output from \code{stm::stm}, \code{tidytext::tidy}, and \code{ggplot2::ggplot}.
#' The result is a ggplot object representing the topic-term plot.
#'
#' @examples
#' suppressWarnings({
Expand All @@ -223,30 +183,65 @@ plot_topic_term <-
#' preprocess_texts(text_field = "abstract") %>%
#' quanteda::dfm()
#' data <- tidytext::tidy(stm_15, document_names = rownames(dfm), log = FALSE)
#' data %>% examine_top_terms(top_n = 5) %>% DT::datatable(rownames = FALSE)
#' data %>% examine_top_terms(top_n = 2) %>%
#' plot_topic_term(ncol = 3)
#' }
#' })
#'
#' @import dplyr
#' @import ggplot2
#' @importFrom magrittr %>%
#' @importFrom rlang := enquos
#' @importFrom tidyr unnest
#' @importFrom DT datatable
#' @importFrom tidytext scale_x_reordered reorder_within
#'
examine_top_terms <-
plot_topic_term <-
function(data, ncol = ncol, topic_names = NULL, ...) {

function(data, top_n, ...) {
top_terms <- data %>%
arrange(beta) %>%
group_by(topic, ...) %>%
top_n(top_n, beta) %>%
arrange(beta) %>%
select(topic, term) %>%
summarise(terms = list(term)) %>%
mutate(terms = purrr::map(terms, paste, collapse = ", ")) %>%
unnest(cols = c(terms))
topic_term <- data %>%
mutate(
ord = factor(topic, levels = c(min(topic): max(topic))),
tt = as.numeric(topic),
topic = paste("Topic", topic),
term = reorder_within(term, beta, topic)) %>%
arrange(ord)

levelt = paste("Topic", topic_term$ord) %>% unique()

topic_term$topic = factor(topic_term$topic,
levels = levelt)
if(!is.null(topic_names)){
topic_term$topic = topic_names[topic_term$tt]
topic_term <- topic_term %>%
mutate(topic = as.character(topic)) %>%
mutate(topic = ifelse(!is.na(topic), topic, paste('Topic',tt)))
topic_term$topic =
factor(topic_term$topic, levels = topic_term$topic %>% unique())
}

topic_term$tt = NULL

topic_term_plot <- ggplot(topic_term, aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE, alpha = 0.8) +
facet_wrap(~ topic, scales = "free", ncol = ncol) +
scale_x_reordered() +
scale_y_continuous(labels = numform::ff_num(zero = 0, digits = 3)) +
coord_flip() +
xlab("") +
ylab("Word probability") +
theme_minimal(base_size = 12) +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(color = "#3B3B3B", size = 0.3),
axis.ticks = element_line(color = "#3B3B3B", size = 0.3),
strip.text.x = element_text(size = 12, color = "#3B3B3B"),
axis.text.x = element_text(size = 12, color = "#3B3B3B"),
axis.text.y = element_text(size = 12, color = "#3B3B3B"),
axis.title = element_text(size = 12, color = "#3B3B3B"),
axis.title.x = element_text(margin = margin(t = 7)),
axis.title.y = element_text(margin = margin(r = 7)))

return(top_terms)
return(topic_term_plot)
}


Expand Down
14 changes: 14 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
comment: false

coverage:
status:
project:
default:
target: auto
threshold: 1%
informational: true
patch:
default:
target: auto
threshold: 1%
informational: true
Binary file added inst/TextAnalysisR.app/source/SpecialEduTech.rda
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added inst/TextAnalysisR.app/source/stopwords_list.rda
Binary file not shown.
Loading

0 comments on commit 3f283a6

Please sign in to comment.