Adding new functions and a bit of extra functionality to pre-existing…

… e.g. make_bigram_viz has an ellipsis, umap shiny has x, y & size args.
jpcompartir · Aug 28, 2022 · d1584eb · d1584eb
1 parent 6f536e7
commit d1584eb
Show file tree

Hide file tree

Showing 14 changed files with 250 additions and 15 deletions.
diff --git a/.Rhistory b/.Rhistory
@@ -1,15 +1,3 @@
-tidyr::unnest(coherence, ...)%>%
-ggplot2::ggplot(ggplot2::aes(x = topic, y = coherence, fill = topic, ...))+
-ggplot2::geom_col(...)+
-ggplot2::theme_light(...)+
-ggplot2::scale_fill_viridis_d(...)+
-ggplot2::theme(legend.position = "none",
-axis.title = ggplot2::element_text(size = 14))+
-ggplot2::labs(x = NULL, ...)
-}
-rm(list=ls())
-document()
-check()
 globalVariables(c("k", "coherence", "topic","text", "permalink", "tone" ,"mention_content", "n_gram", "count", "ngram", "df", "data","pattern", "before", "after"))
 use_package('tidyr')
 use_package('ggplot2')
@@ -510,3 +498,15 @@ system('git stash .')
 system("git stage .")
 system('git merge main')
 library(JPackage)
+library(devtools)
+library(roxygen2)
+#roxygenise(clean = TRUE)
+document()
+system("git stage .")
+use_r("sample_pull")
+document()
+document()
+check()
+system("git stage .")
+document()
+?ifelse
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,19 +1,24 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(clean_embeds)
+export(clean_entities)
 export(context_grabber)
 export(count_multiple)
 export(fix_radarly)
 export(get_tags)
 export(huggingface_quick_clean)
 export(make_bigram_viz)
+export(make_top_terms_table)
 export(par_fit_LDAs)
 export(percent_summ)
 export(plot_bars_coherence)
 export(plot_group_bigrams)
 export(plot_k_coherence)
 export(sample_pull)
+export(save_plot_list)
 export(spam_grams)
 export(tidy_huggingface_zeroshot)
+export(top_terms_plot)
 export(umap_shiny)
 importFrom(magrittr,"%>%")
diff --git a/R/clean_embeddings.R b/R/clean_embeddings.R
@@ -0,0 +1,16 @@
+#' Quickly reformat a sentence transformers embedding output
+#'
+#' Drops the first column, cleans the remaining column names with
+#' `janitor::clean_names()` and replaces a leading "x" in each name with "dim_".
+#'
+#' @param embeds an n-dimensional data frame with an index column (Pandas output without index = False)
+#'
+#' @return data frame with column names cleaned and index column removed
+#' @export
+#'
+clean_embeds <- function(embeds){
+  embeds <- embeds %>%
+    dplyr::select(-1) %>%
+    janitor::clean_names()
+
+  # Anchor on the start of the name: the unanchored pattern "x" rewrote every
+  # "x" in a name (e.g. "max_x" became "madim__dim_"), not just the prefix.
+  names(embeds) <- stringr::str_replace_all(names(embeds), "^x", "dim_")
+
+  embeds
+}
diff --git a/R/clean_entities.R b/R/clean_entities.R
@@ -0,0 +1,24 @@
+#' Pull the entity type, score and text out of Hugging Face entity recogniser output
+#'
+#' @param df data frame with entities extracted in JSON format
+#'
+#' @return tidier data frame for analysis.
+#' @export
+#'
+clean_entities <- function(df) {
+  # Long format: one row per non-missing entity string; the cleaned column
+  # `x1` becomes `document`.
+  entities_long <- df %>%
+    janitor::clean_names() %>%
+    dplyr::rename(document = x1) %>%
+    tidyr::pivot_longer(dplyr::contains('x')) %>%
+    dplyr::filter(!is.na(value))
+
+  # Split each entity string on commas into its four fields.
+  entities_split <- tidyr::separate(
+    entities_long,
+    value,
+    into = c("entity_type", "entity_score","entity",  "index"),
+    sep = ","
+  )
+
+  # Strip the key prefixes and quote marks, round the score, then drop the
+  # helper columns.
+  entities_split %>%
+    dplyr::mutate(
+      entity_type = stringr::str_remove(entity_type, "\\{'entity_group': '"),
+      entity_type = stringr::str_remove(entity_type, "'"),
+      entity_score = stringr::str_remove(entity_score, " 'score': "),
+      entity_score = stringr::str_remove(entity_score, " '"),
+      entity = stringr::str_remove(entity, " 'word': '"),
+      entity = stringr::str_remove(entity, "'"),
+      entity_score = round(as.numeric(entity_score), 3)
+    ) %>%
+    dplyr::select(-name, -index)
+}
diff --git a/R/make_bigram_viz.R b/R/make_bigram_viz.R
@@ -8,9 +8,9 @@
+#' @param ... Additional arguments passed on to `ParseR::count_ngram()`
 #' @return A ggplot2 network viz
 #' @export
 
-make_bigram_viz <- function(data, text_var = mention_content, top_n = 50, min = 10){
+make_bigram_viz <- function(data, text_var = mention_content, top_n = 50, min = 10, ...){
   data %>%
-    ParseR::count_ngram(text_var = {{text_var }}, top_n = top_n, min_freq = min)%>%
+    ParseR::count_ngram(text_var = {{text_var }}, top_n = top_n, min_freq = min, ...)%>%
     purrr::pluck("viz")%>%
     ParseR::viz_ngram()
 }
diff --git a/R/make_top_terms_table.R b/R/make_top_terms_table.R
@@ -0,0 +1,31 @@
+#' Quickly make a top terms table from UMAP/Sentence Transformers/kMeans clusters
+#'
+#' Splits the data by group, strips numbers from the text with
+#' `tm::removeNumbers()`, passes it through `LimpiaR::limpiar_url()` and
+#' `LimpiaR::limpiar_spaces()`, tokenises it into a `words` column with
+#' `tidytext::unnest_tokens()` and counts tokens per group. Group labels then
+#' have underscores replaced by spaces and are converted to title case.
+#'
+#' @param df Data frame object with clusters/topics
+#' @param group_var cluster/topic variable
+#' @param text_var the text variable
+#'
+#' @return a data frame with tokens counted: the group column, `words` and
+#'   the count `n`, sorted by descending `n`
+#' @export
+#'
+make_top_terms_table <- function(df, group_var = cluster_name, text_var = mention_content){
+
+  text_quo <- rlang::enquo(text_var)
+  text_sym <- rlang::ensym(text_var)
+
+  group_quo <- rlang::enquo(group_var)
+  group_sym <- rlang::ensym(group_var)
+
+
+  df %>%
+    dplyr::group_split({{group_var}}) %>%
+    purrr::map(~.x %>%
+                 dplyr::mutate(!!text_quo := !! text_sym) %>%
+                 dplyr::mutate(!!text_quo := tm::removeNumbers(!! text_sym)) %>%
+                 LimpiaR::limpiar_url(!!text_quo) %>%
+                 LimpiaR::limpiar_spaces(!!text_quo) %>%
+                 tidytext::unnest_tokens(words, !! text_sym)) %>%
+    # Namespaced call: a bare `bind_rows` is not imported by the package and
+    # only resolved when dplyr happened to be attached. bind_rows() takes the
+    # list of data frames directly, so no reduce() is needed.
+    dplyr::bind_rows() %>%
+    dplyr::count({{group_var}}, words, sort = TRUE) %>%
+    dplyr::mutate(!!group_quo := stringr::str_replace_all(!! group_sym, "_", " "),
+                  !!group_quo := stringr::str_to_title(!! group_sym))
+}
diff --git a/R/plot_top_terms.R b/R/plot_top_terms.R
@@ -0,0 +1,43 @@
+#' Make faceted lollipop charts for top terms from a cluster
+#'
+#' Keeps `top_n` rows per group ranked by the count column `n`
+#' (`with_ties = FALSE`, so ties never add extra rows), reorders the terms
+#' within each group and draws one lollipop facet per group.
+#'
+#' @param top_terms_table the output of make_top_terms_table(); must contain a count column `n`
+#' @param words_var the name of the variable in which word tokens have been saved
+#' @param group_var cluster / topic / grouping variable
+#' @param top_n number of terms per plot
+#' @param nrow number of rows to display plots on
+#'
+#' @return faceted lollipops
+#' @export
+#'
+top_terms_plot <- function(top_terms_table, words_var = words, group_var = cluster_name, top_n = 15, nrow = 2){
+
+  words_quo <- rlang::enquo(words_var)
+  words_sym <- rlang::ensym(words_var)
+
+  group_sym <- rlang::ensym(group_var)
+
+  top_terms_table %>%
+    dplyr::group_by({{group_var}}) %>%
+    dplyr::slice_max(order_by = n, n = top_n, with_ties = FALSE) %>%
+    dplyr::ungroup() %>%
+    # reorder_within() orders terms separately inside each facet; it is paired
+    # with scale_x_reordered() below.
+    dplyr::mutate(!! words_quo := tidytext::reorder_within(!! words_sym, n, !!group_sym)) %>%
+    # aes() is namespaced: the package does not import ggplot2, so a bare
+    # aes() only resolved when ggplot2 happened to be attached.
+    ggplot2::ggplot(ggplot2::aes(x= {{words_var}} , y = n, fill = !! group_sym,color = !! group_sym)) +
+    ggplot2::geom_segment(ggplot2::aes(x = {{words_var}}, xend = {{words_var}},
+                                       y = 0, yend = n),
+                          show.legend = FALSE) +
+    ggplot2::geom_point(size = 3,
+                        shape = 21,
+                        show.legend = FALSE) +
+    ggplot2::coord_flip() +
+    HelpR::theme_microsoft_discrete() +
+    ggplot2::facet_wrap(ggplot2::vars(!!group_sym),
+                        scales = "free",
+                        nrow = nrow)+
+    ggplot2::theme_minimal()+
+    tidytext::scale_x_reordered("Term") +
+    ggplot2::theme(strip.text = ggplot2::element_text(size = 12))+
+    ggplot2::labs(x = NULL,
+                  y = "Term Frequency")
+}
diff --git a/R/save_plot_list.R b/R/save_plot_list.R
@@ -0,0 +1,11 @@
+#' Save a list of plots
+#'
+#' Each plot is written with `ggplot2::ggsave()` on a white background to a
+#' file named `<plot_type>_<name>.png`, where `<name>` is the plot's name in
+#' `plot_list`.
+#'
+#' @param plot_list Should be a named list of plots
+#' @param plot_type For saving prefix that gets pasted with name and .png = e.g. "bigram_microsoft_topic_1.png"
+#'
+#' @return called for its side effect of saving plots to the working
+#'   directory; invisibly returns the list of `ggsave()` results
+#' @export
+#'
+save_plot_list <- function(plot_list, plot_type = "bigram"){
+  saved <- lapply(names(plot_list), function(x) {
+    ggplot2::ggsave(
+      filename = paste0(plot_type, "_", x, ".png"),
+      bg = "white",
+      # Read from the argument: the previous body indexed a global
+      # `bigrams_list`, so the function ignored the list it was given.
+      plot = plot_list[[x]]
+    )
+  })
+  invisible(saved)
+}
diff --git a/man/clean_embeds.Rd b/man/clean_embeds.Rd
diff --git a/man/clean_entities.Rd b/man/clean_entities.Rd
diff --git a/man/make_bigram_viz.Rd b/man/make_bigram_viz.Rd
diff --git a/man/make_top_terms_table.Rd b/man/make_top_terms_table.Rd
diff --git a/man/save_plot_list.Rd b/man/save_plot_list.Rd
diff --git a/man/top_terms_plot.Rd b/man/top_terms_plot.Rd