-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
… e.g. make_bigram_viz has ellipsis, umap_shiny has x, y & size args.
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,24 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export("%>%") | ||
export(clean_embeds) | ||
export(clean_entities) | ||
export(context_grabber) | ||
export(count_multiple) | ||
export(fix_radarly) | ||
export(get_tags) | ||
export(huggingface_quick_clean) | ||
export(make_bigram_viz) | ||
export(make_top_terms_table) | ||
export(par_fit_LDAs) | ||
export(percent_summ) | ||
export(plot_bars_coherence) | ||
export(plot_group_bigrams) | ||
export(plot_k_coherence) | ||
export(sample_pull) | ||
export(save_plot_list) | ||
export(spam_grams) | ||
export(tidy_huggingface_zeroshot) | ||
export(top_terms_plot) | ||
export(umap_shiny) | ||
importFrom(magrittr,"%>%") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#' Quickly reformat a sentence transformers embedding output
#'
#' Drops the first column (the index column Pandas writes out unless
#' `index = False` is used), cleans the remaining column names with
#' `janitor::clean_names()` and swaps the leading `x` of each cleaned name
#' for `dim_`.
#'
#' @param embeds an n-dimensional data frame with an index column (Pandas output without index = False)
#'
#' @return data frame with column names cleaned and index column removed
#' @export
#'
clean_embeds <- function(embeds) {
  embeds <- embeds %>%
    dplyr::select(-1) %>%
    janitor::clean_names()

  # Only the leading "x" is replaced. Replacing every "x" would also mangle
  # any column whose name merely contains that letter.
  # NOTE(review): presumably the leading "x" is the prefix clean_names() adds
  # to purely numeric column names -- confirm against real embedding output.
  names(embeds) <- stringr::str_replace(names(embeds), "^x", "dim_")

  embeds
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#' Quickly extract columns of interest from Hugging Face entity recognisers
#'
#' Cleans the column names, renames the first cleaned column (`x1`) to
#' `document`, stacks every remaining column whose name contains "x" into
#' long format, drops missing values and splits each entity string on commas
#' into its type, score and word. The surrounding dictionary text is then
#' stripped from each piece and the score is rounded to three decimals.
#'
#' @param df data frame with entities extracted in JSON format
#'
#' @return tidier data frame for analysis, with the helper columns `name` and
#'   `index` removed.
#' @export
#'
clean_entities <- function(df) {
  # One row per (document, entity string); empty cells are discarded.
  long_entities <- df %>%
    janitor::clean_names() %>%
    dplyr::rename(document = x1) %>%
    tidyr::pivot_longer(dplyr::contains('x')) %>%
    dplyr::filter(!is.na(value))

  # Split the raw entity string into its comma-separated fields.
  split_entities <- tidyr::separate(
    long_entities,
    value,
    into = c("entity_type", "entity_score", "entity", "index"),
    sep = ","
  )

  # Strip the dictionary keys and quotes around each value, then make the
  # score numeric.
  split_entities %>%
    dplyr::mutate(
      entity_type = entity_type %>%
        stringr::str_remove("\\{'entity_group': '") %>%
        stringr::str_remove("'"),
      entity_score = entity_score %>%
        stringr::str_remove(" 'score': ") %>%
        stringr::str_remove(" '"),
      entity = entity %>%
        stringr::str_remove(" 'word': '") %>%
        stringr::str_remove("'"),
      entity_score = round(as.numeric(entity_score), 3)
    ) %>%
    dplyr::select(-name, -index)
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#' Quickly make a top terms table from UMAP/Sentence Transformers/kMeans clusters
#'
#' Splits the data by group, strips numbers, URLs and extra spaces from the
#' text of each group, tokenises the text into a `words` column, then counts
#' tokens per group. Underscores in the group labels are replaced by spaces
#' and the labels are converted to title case.
#'
#' @param df Data frame object with clusters/topics
#' @param group_var cluster/topic variable
#' @param text_var the text variable
#'
#' @return a data frame with tokens counted: the grouping column, `words`
#'   and `n`, sorted by descending `n`.
#' @export
#'
make_top_terms_table <- function(df, group_var = cluster_name, text_var = mention_content) {

  text_quo <- rlang::enquo(text_var)
  text_sym <- rlang::ensym(text_var)

  group_quo <- rlang::enquo(group_var)
  group_sym <- rlang::ensym(group_var)

  df %>%
    dplyr::group_split({{ group_var }}) %>%
    purrr::map(~ .x %>%
      dplyr::mutate(!!text_quo := tm::removeNumbers(!!text_sym)) %>%
      LimpiaR::limpiar_url(!!text_quo) %>%
      LimpiaR::limpiar_spaces(!!text_quo) %>%
      tidytext::unnest_tokens(words, !!text_sym)) %>%
    # bind_rows must be namespaced: the package only imports `%>%`, so a bare
    # `bind_rows` is not found unless the caller has attached dplyr.
    dplyr::bind_rows() %>%
    dplyr::count({{ group_var }}, words, sort = TRUE) %>%
    dplyr::mutate(
      !!group_quo := stringr::str_replace_all(!!group_sym, "_", " "),
      !!group_quo := stringr::str_to_title(!!group_sym)
    )
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#' Make faceted lollipop charts for top terms from a cluster
#'
#' Keeps the `top_n` most frequent terms per group (ties are not kept),
#' orders the terms within each group and draws one lollipop chart per group,
#' with terms along the flipped axis and their frequency `n` as the lollipop
#' length.
#'
#' @param top_terms_table the output of make_top_terms_table()
#' @param words_var the name of the variable in which word tokens have been saved
#' @param group_var cluster / topic / grouping variable
#' @param top_n number of terms per plot
#' @param nrow number of rows to display plots on
#'
#' @return faceted lollipops
#' @export
#'
top_terms_plot <- function(top_terms_table, words_var = words, group_var = cluster_name, top_n = 15, nrow = 2) {

  words_quo <- rlang::enquo(words_var)
  words_sym <- rlang::ensym(words_var)

  group_quo <- rlang::enquo(group_var)
  group_sym <- rlang::ensym(group_var)

  top_terms_table %>%
    dplyr::group_by({{ group_var }}) %>%
    dplyr::slice_max(order_by = n, n = top_n, with_ties = FALSE) %>%
    dplyr::ungroup() %>%
    # reorder_within() lets each facet sort its own terms by frequency;
    # scale_x_reordered() below handles the axis labels for those terms.
    dplyr::mutate(!!words_quo := tidytext::reorder_within(!!words_sym, n, !!group_sym)) %>%
    # aes() is namespaced because the package does not import ggplot2, so a
    # bare `aes` is only found when the caller has attached ggplot2.
    ggplot2::ggplot(ggplot2::aes(x = {{ words_var }}, y = n, fill = !!group_sym, color = !!group_sym)) +
    ggplot2::geom_segment(
      ggplot2::aes(
        x = {{ words_var }}, xend = {{ words_var }},
        y = 0, yend = n
      ),
      show.legend = FALSE
    ) +
    ggplot2::geom_point(
      size = 3,
      shape = 21,
      show.legend = FALSE
    ) +
    ggplot2::coord_flip() +
    HelpR::theme_microsoft_discrete() +
    ggplot2::facet_wrap(ggplot2::vars(!!group_sym),
      scales = "free",
      nrow = nrow
    ) +
    # NOTE(review): theme_minimal() is added after theme_microsoft_discrete(),
    # so it may override theme settings from the latter -- confirm intended.
    ggplot2::theme_minimal() +
    tidytext::scale_x_reordered("Term") +
    ggplot2::theme(strip.text = ggplot2::element_text(size = 12)) +
    ggplot2::labs(
      x = NULL,
      y = "Term Frequency"
    )
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
#' Save a list of plots
#'
#' Saves every plot in `plot_list` as a PNG with a white background. Each
#' file is named `<plot_type>_<name>.png`, where `<name>` is the plot's name
#' in the list, and is written relative to the working directory.
#'
#' @param plot_list Should be a named list of plots
#' @param plot_type For saving prefix that gets pasted with name and .png = e.g. "bigram_microsoft_topic_1.png"
#'
#' @return invisibly, the list of values returned by `ggplot2::ggsave()`;
#'   called for its side effect of saving plots to the working directory
#' @export
#'
save_plot_list <- function(plot_list, plot_type = "bigram") {
  saved <- lapply(names(plot_list), function(x) {
    ggplot2::ggsave(
      filename = paste0(plot_type, "_", x, ".png"),
      bg = "white",
      # Read from the `plot_list` argument; the previous version looked up a
      # global `bigrams_list`, which ignored the argument entirely.
      plot = plot_list[[x]]
    )
  })

  invisible(saved)
}
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.