Skip to content

Commit

Permalink
Adding new functions and a bit of extra functionality to pre-existing…
Browse files Browse the repository at this point in the history
… e.g. make_bigram_viz has ellipsis, umap shiny has x, y & size args.
  • Loading branch information
jpcompartir committed Aug 28, 2022
1 parent 6f536e7 commit d1584eb
Show file tree
Hide file tree
Showing 14 changed files with 250 additions and 15 deletions.
24 changes: 12 additions & 12 deletions .Rhistory
Original file line number Diff line number Diff line change
@@ -1,15 +1,3 @@
tidyr::unnest(coherence, ...)%>%
ggplot2::ggplot(ggplot2::aes(x = topic, y = coherence, fill = topic, ...))+
ggplot2::geom_col(...)+
ggplot2::theme_light(...)+
ggplot2::scale_fill_viridis_d(...)+
ggplot2::theme(legend.position = "none",
axis.title = ggplot2::element_text(size = 14))+
ggplot2::labs(x = NULL, ...)
}
rm(list=ls())
document()
check()
globalVariables(c("k", "coherence", "topic","text", "permalink", "tone" ,"mention_content", "n_gram", "count", "ngram", "df", "data","pattern", "before", "after"))
use_package('tidyr')
use_package('ggplot2')
Expand Down Expand Up @@ -510,3 +498,15 @@ system('git stash .')
system("git stage .")
system('git merge main')
library(JPackage)
library(devtools)
library(roxygen2)
#roxygenise(clean = TRUE)
document()
system("git stage .")
use_r("sample_pull")
document()
document()
check()
system("git stage .")
document()
?ifelse
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,19 +1,24 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(clean_embeds)
export(clean_entities)
export(context_grabber)
export(count_multiple)
export(fix_radarly)
export(get_tags)
export(huggingface_quick_clean)
export(make_bigram_viz)
export(make_top_terms_table)
export(par_fit_LDAs)
export(percent_summ)
export(plot_bars_coherence)
export(plot_group_bigrams)
export(plot_k_coherence)
export(sample_pull)
export(save_plot_list)
export(spam_grams)
export(tidy_huggingface_zeroshot)
export(top_terms_plot)
export(umap_shiny)
importFrom(magrittr,"%>%")
16 changes: 16 additions & 0 deletions R/clean_embeddings.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#' Quickly reformat a sentence transformers embedding output
#'
#' Drops the first column of `embeds`, cleans the remaining column names with
#' `janitor::clean_names()`, and replaces the leading "x" of each cleaned name
#' with "dim_".
#'
#' @param embeds an n-dimensional data frame with an index column (Pandas output without index = False)
#'
#' @return data frame with column names cleaned and index column removed
#' @export
#'
clean_embeds <- function(embeds) {
  embeds <- embeds %>%
    dplyr::select(-1) %>%
    janitor::clean_names()

  # Only the leading "x" is swapped for "dim_". Replacing every "x" would
  # also mangle names that merely contain the letter (e.g. "max").
  # NOTE(review): presumably the leading "x" is what clean_names() puts in
  # front of purely numeric column names such as "0", "1", ... - confirm.
  names(embeds) <- stringr::str_replace(names(embeds), "^x", "dim_")

  embeds
}
24 changes: 24 additions & 0 deletions R/clean_entities.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#' Quickly extract columns of interest from Hugging Face entity recognisers
#'
#' Cleans the column names, renames `x1` to `document`, stacks every column
#' whose name contains "x" into long format, drops missing values, and splits
#' each remaining value on commas into entity type, score, word and index.
#' The leftover dictionary-style prefixes are stripped from the text and the
#' score is converted to a number rounded to three decimal places.
#'
#' @param df data frame with entities extracted in JSON format
#'
#' @return tidier data frame for analysis.
#' @export
#'
clean_entities <- function(df) {
  # Standardise names, then relabel the `x1` column as `document`
  renamed <- dplyr::rename(janitor::clean_names(df), document = x1)

  # One row per non-missing value found in the columns containing "x"
  stacked <- renamed %>%
    tidyr::pivot_longer(dplyr::contains('x')) %>%
    dplyr::filter(!is.na(value))

  # Break each value into its four comma-separated pieces
  pieces <- tidyr::separate(
    stacked,
    value,
    into = c("entity_type", "entity_score", "entity", "index"),
    sep = ","
  )

  # Strip the key/quote residue from each piece, then make the score numeric
  tidied <- dplyr::mutate(
    pieces,
    entity_type = stringr::str_remove(entity_type, "\\{'entity_group': '"),
    entity_type = stringr::str_remove(entity_type, "'"),
    entity_score = stringr::str_remove(entity_score, " 'score': "),
    entity_score = stringr::str_remove(entity_score, " '"),
    entity = stringr::str_remove(entity, " 'word': '"),
    entity = stringr::str_remove(entity, "'"),
    entity_score = round(as.numeric(entity_score), 3)
  )

  dplyr::select(tidied, -name, -index)
}
4 changes: 2 additions & 2 deletions R/make_bigram_viz.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
#' @return A ggplot2 network viz
#' @export

# NOTE(review): this span is a rendered diff, so the pre-change and the
# post-change versions of the signature and of the count_ngram() call appear
# back to back; the second line of each pair is the one that adds `...`.
# The function counts n-grams in `text_var` with ParseR::count_ngram()
# (keeping `top_n` terms with a minimum frequency of `min`), plucks the "viz"
# element of the result and hands it to ParseR::viz_ngram().
make_bigram_viz <- function(data, text_var = mention_content, top_n = 50, min = 10){
make_bigram_viz <- function(data, text_var = mention_content, top_n = 50, min = 10, ...){
data %>%
ParseR::count_ngram(text_var = {{text_var }}, top_n = top_n, min_freq = min)%>%
# `...` is forwarded unchanged to ParseR::count_ngram()
ParseR::count_ngram(text_var = {{text_var }}, top_n = top_n, min_freq = min, ...)%>%
purrr::pluck("viz")%>%
ParseR::viz_ngram()
}
31 changes: 31 additions & 0 deletions R/make_top_terms_table.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#' Quickly make a top terms table from UMAP/Sentence Transformers/kMeans clusters
#'
#' Splits `df` by `group_var`, removes numbers, URLs and surplus spaces from
#' `text_var` within each group, tokenises the text into a `words` column,
#' and counts each word per group. Underscores in the group labels are
#' replaced with spaces and the labels are converted to title case.
#'
#' @param df Data frame object with clusters/topics
#' @param group_var cluster/topic variable
#' @param text_var the text variable
#'
#' @return a data frame with tokens counted
#' @export
#'
make_top_terms_table <- function(df, group_var = cluster_name, text_var = mention_content) {

  text_quo <- rlang::enquo(text_var)
  text_sym <- rlang::ensym(text_var)

  group_quo <- rlang::enquo(group_var)
  group_sym <- rlang::ensym(group_var)

  # Clean and tokenise the rows belonging to a single group
  tokenise_group <- function(group_df) {
    group_df %>%
      dplyr::mutate(!!text_quo := tm::removeNumbers(!!text_sym)) %>%
      LimpiaR::limpiar_url(!!text_quo) %>%
      LimpiaR::limpiar_spaces(!!text_quo) %>%
      tidytext::unnest_tokens(words, !!text_sym)
  }

  df %>%
    dplyr::group_split({{ group_var }}) %>%
    purrr::map(tokenise_group) %>%
    # bind_rows must be namespace-qualified: the package only imports `%>%`,
    # so a bare `bind_rows` is not found unless dplyr happens to be attached.
    dplyr::bind_rows() %>%
    dplyr::count({{ group_var }}, words, sort = TRUE) %>%
    dplyr::mutate(
      !!group_quo := stringr::str_replace_all(!!group_sym, "_", " "),
      !!group_quo := stringr::str_to_title(!!group_sym)
    )
}
43 changes: 43 additions & 0 deletions R/plot_top_terms.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#' Make faceted lollipop charts for top terms from a cluster
#'
#' Keeps the `top_n` most frequent terms per group (ties are not kept),
#' reorders the terms within each group by their count `n`, and draws one
#' lollipop panel (segment plus point) per group with flipped axes.
#'
#' @param top_terms_table the output of make_top_terms_table()
#' @param words_var the name of the variable in which word tokens have been saved
#' @param group_var cluster / topic / grouping variable
#' @param top_n number of terms per plot
#' @param nrow number of rows to display plots on
#'
#' @return faceted lollipops
#' @export
#'
top_terms_plot <- function(top_terms_table, words_var = words, group_var = cluster_name, top_n = 15, nrow = 2) {

  words_quo <- rlang::enquo(words_var)
  words_sym <- rlang::ensym(words_var)

  group_sym <- rlang::ensym(group_var)

  top_terms_table %>%
    dplyr::group_by({{ group_var }}) %>%
    dplyr::slice_max(order_by = n, n = top_n, with_ties = FALSE) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(!!words_quo := tidytext::reorder_within(!!words_sym, n, !!group_sym)) %>%
    # aes() is namespace-qualified because the package does not import
    # ggplot2; a bare aes() only works when ggplot2 happens to be attached.
    ggplot2::ggplot(ggplot2::aes(x = {{ words_var }}, y = n, fill = !!group_sym, color = !!group_sym)) +
    # geom_col(position = "dodge", show.legend = FALSE)+
    ggplot2::geom_segment(
      ggplot2::aes(x = {{ words_var }}, xend = {{ words_var }}, y = 0, yend = n),
      show.legend = FALSE
    ) +
    ggplot2::geom_point(size = 3, shape = 21, show.legend = FALSE) +
    ggplot2::coord_flip() +
    HelpR::theme_microsoft_discrete() +
    ggplot2::facet_wrap(ggplot2::vars(!!group_sym), scales = "free", nrow = nrow) +
    # NOTE(review): theme_minimal() is added after the HelpR theme and
    # labs(x = NULL) after scale_x_reordered("Term"); the later calls
    # presumably win - confirm that this ordering is intended.
    ggplot2::theme_minimal() +
    tidytext::scale_x_reordered("Term") +
    ggplot2::theme(strip.text = ggplot2::element_text(size = 12)) +
    ggplot2::labs(x = NULL, y = "Term Frequency")
}
11 changes: 11 additions & 0 deletions R/save_plot_list.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#' Save a list of plots
#'
#' Writes every plot in `plot_list` to a PNG file with a white background.
#' Each file is named `<plot_type>_<name>.png`, where `<name>` is the plot's
#' name in the list.
#'
#' @param plot_list Should be a named list of plots
#' @param plot_type For saving prefix that gets pasted with name and .png = e.g. "bigram_microsoft_topic_1.png"
#'
#' @return called for its side effect of saving plots to the working
#'   directory; invisibly returns the list of values returned by
#'   `ggplot2::ggsave()`, one per plot
#' @export
#'
save_plot_list <- function(plot_list, plot_type = "bigram") {
  plot_names <- names(plot_list)

  # Without names there is nothing to build file names from; fail loudly
  # instead of silently saving nothing.
  if (length(plot_list) > 0 &&
      (is.null(plot_names) || any(is.na(plot_names) | plot_names == ""))) {
    stop("`plot_list` must be a named list of plots.", call. = FALSE)
  }

  saved <- lapply(plot_names, function(x) {
    ggplot2::ggsave(
      filename = paste0(plot_type, "_", x, ".png"),
      # Read from the `plot_list` argument rather than a global object
      plot = plot_list[[x]],
      bg = "white"
    )
  })

  invisible(saved)
}
17 changes: 17 additions & 0 deletions man/clean_embeds.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/clean_entities.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/make_bigram_viz.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions man/make_top_terms_table.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions man/save_plot_list.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 31 additions & 0 deletions man/top_terms_plot.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit d1584eb

Please sign in to comment.