diff --git a/DESCRIPTION b/DESCRIPTION index 934dcf2..1573822 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,7 +5,7 @@ Description: Check available classification and regression data sets from the PM These data sets cover a range of applications, and include binary/multi-class classification problems and regression problems, as well as combinations of categorical, ordinal, and continuous features. There are currently over 150 datasets included in the PMLB repository. -Version: 0.2.3 +Version: 0.3.0 Authors@R: c( person("Trang", "Le", email = "grixor@gmail.com", role = c("aut", "cre"), comment = "https://trang.page/"), person("makeyourownmaker", email = "makeyourownmaker@gmx.com", role = "aut", comment = "https://github.com/makeyourownmaker"), @@ -22,3 +22,6 @@ URL: https://github.com/EpistasisLab/pmlbr Encoding: UTF-8 LazyData: true RoxygenNote: 7.3.2 +Suggests: + testthat (>= 3.0.0) +Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 10d52ce..3a882a6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,5 +3,10 @@ S3method(nearest_datasets,character) S3method(nearest_datasets,data.frame) S3method(nearest_datasets,default) +export(classification_datasets) +export(dataset_names) export(fetch_data) export(nearest_datasets) +export(pmlb_metadata) +export(regression_datasets) +export(summary_stats) diff --git a/NEWS.md b/NEWS.md index f882534..b58ab4b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# pmlbr 0.3.0 + # pmlbr 0.2.3 * Use interactive() diff --git a/R/data.R b/R/data.R deleted file mode 100644 index aea3ed0..0000000 --- a/R/data.R +++ /dev/null @@ -1,39 +0,0 @@ -#' Names of all available datasets -#' -#' A list of the names of available datasets -#' -#' @source \url{https://github.com/EpistasisLab/pmlb} -"dataset_names" - -#' Names of available classification datasets -#' -#' A list of the names of available classification datasets -#' -#' @source \url{https://github.com/EpistasisLab/pmlb} -"classification_dataset_names" - -#' Names of 
available regression datasets -#' -#' A list of the names of available regression datasets -#' -#' @source \url{https://github.com/EpistasisLab/pmlb} -"regression_dataset_names" - -#' Summary statistics for the all datasets -#' -#' @format A data frame with 10 variables: -#' \describe{ -#' \item{dataset:}{Dataset name} -#' \item{n_instances:}{Number of data observations (equal to number of rows)} -#' \item{n_features:}{Total number of features (number of columns - 1)} -#' \item{n_binary_features:}{Number of binary features} -#' \item{n_categorical_features:}{Number of categorical features} -#' \item{n_continuous_features:}{Number of continuous features} -#' \item{n_classes:}{Number of classes in target variable} -#' \item{endpoint_type:}{Value type of endpoint/target (can be binary, categorical or continuous)} -#' \item{imbalance:}{Imbalance metric, where zero means that the dataset is perfectly balanced and the higher the value, the more imbalanced the dataset} -#' \item{task:}{Type of problem/task. Can be classification or regression.} -#' } -#' -#' @source \url{https://github.com/EpistasisLab/pmlb} -"summary_stats" diff --git a/R/globals.R b/R/globals.R deleted file mode 100644 index 3d00df9..0000000 --- a/R/globals.R +++ /dev/null @@ -1 +0,0 @@ -utils::globalVariables(c('dataset_names', 'summary_stats')) diff --git a/R/list_datasets.R b/R/list_datasets.R new file mode 100644 index 0000000..7bdc007 --- /dev/null +++ b/R/list_datasets.R @@ -0,0 +1,100 @@ +#' Get metadata for all datasets in PMLB. +#' +#' Metadata like summary statistics and names of available datasets +#' on the PMLB repository. 
+#' +#' @return A list containing summary_stats, dataset_names, classification_datasets, and regression_datasets +#' @export +#' @examples +#' if (interactive()) { +#' sample(pmlb_metadata()$dataset_names, 10) +#' } +pmlb_metadata <- function() { + if (!exists("summary_stats", envir = .pmlbr_env)) { + links_to_stats <- 'https://github.com/EpistasisLab/pmlb/raw/master/pmlb/all_summary_stats.tsv' + summary_stats <- utils::read.csv(links_to_stats, sep = '\t') + colnames(summary_stats) <- tolower(gsub( + 'X.', + 'n_', + colnames(summary_stats) + )) + assign( + "summary_stats", + summary_stats, + envir = .pmlbr_env + ) + assign( + "dataset_names", + summary_stats$dataset, + envir = .pmlbr_env + ) + assign( + "regression_datasets", + sort(summary_stats[summary_stats$task == "regression", "dataset"]), + envir = .pmlbr_env + ) + assign( + "classification_datasets", + sort(summary_stats[summary_stats$task == "classification", "dataset"]), + envir = .pmlbr_env + ) + } + + list( + summary_stats = .pmlbr_env$summary_stats, + dataset_names = .pmlbr_env$dataset_names, + classification_datasets = .pmlbr_env$classification_datasets, + regression_datasets = .pmlbr_env$regression_datasets + ) +} + + +#' All available datasets +#' +#' @return A character vector of all dataset names. +#' @export +#' @examples +#' if (interactive()) { +#' sample(dataset_names(), 10) +#' } +dataset_names <- function() { + pmlb_metadata()$dataset_names +} + +#' Classification datasets +#' +#' @return A character vector of classification dataset names. +#' @export +#' @examples +#' if (interactive()) { +#' sample(classification_datasets(), 10) +#' } +classification_datasets <- function() { + pmlb_metadata()$classification_datasets +} + +#' Regression datasets +#' +#' @return A character vector of regression dataset names. 
+#' @export +#' @examples +#' if (interactive()) { +#' sample(regression_datasets(), 10) +#' } +regression_datasets <- function() { + pmlb_metadata()$regression_datasets +} + +#' Summary statistics +#' +#' @return A dataframe of summary statistics of all available datasets, +#' including number of instances/rows, number of columns/features, task, etc. +#' +#' @export +#' @examples +#' if (interactive()) { +#' head(summary_stats()) +#' } +summary_stats <- function() { + pmlb_metadata()$summary_stats +} diff --git a/R/nearest.R b/R/nearest.R index 336df7e..d08b515 100644 --- a/R/nearest.R +++ b/R/nearest.R @@ -10,7 +10,7 @@ #' @param n_neighbors Integer. The number of dataset names to return as neighbors. #' @param dimensions Character vector specifying dataset characteristics to include in similarity calculation. #' Dimensions must correspond to numeric columns of -#' [all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summary_stats.tsv). +#' [all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summary_stats.tsv). #' If 'all' (default), uses all numeric columns. #' @param task Character string specifying classification or regression for summary stat generation. #' @param target_name Character string specifying column of target/dependent variable. @@ -26,14 +26,14 @@ #' nearest_datasets('penguins') #' nearest_datasets(fetch_data('penguins')) #' } -nearest_datasets <- function(x, ...){ +nearest_datasets <- function(x, ...) { UseMethod('nearest_datasets', x) } #' @rdname nearest_datasets-methods #' @export -nearest_datasets.default <- function(x, ...){ +nearest_datasets.default <- function(x, ...) 
{ stop('`x` must be of class `data.frame` or `character`.') } @@ -41,19 +41,30 @@ nearest_datasets.default <- function(x, ...){ #' @rdname nearest_datasets-methods #' @export nearest_datasets.character <- function( - x, n_neighbors = 5, + x, + n_neighbors = 5, dimensions = c('n_instances', 'n_features'), - target_name = 'target', ...) { - - if (!(x %in% dataset_names)) - stop("'dataset_name' ", x, " not found in PMLB.\n * Check spelling, capitalisation etc.", call.=FALSE) - dataset_stats <- summary_stats[summary_stats$dataset == x, ] - - num_cols <- unlist(lapply(summary_stats, function(x) is.numeric(x)||is.integer(x))) - summary_task <- summary_stats[summary_stats$task == dataset_stats$task, ] # restrict to same task + target_name = 'target', + ... +) { + if (!(x %in% dataset_names())) + stop( + "'dataset_name' ", + x, + " not found in PMLB.\n * Check spelling, capitalisation etc.", + call. = FALSE + ) + sum_stats <- summary_stats() + dataset_stats <- sum_stats[sum_stats$dataset == x, ] + + num_cols <- unlist(lapply( + sum_stats, + function(x) is.numeric(x) || is.integer(x) + )) + summary_task <- sum_stats[sum_stats$task == dataset_stats$task, ] # restrict to same task summary_i <- summary_task[, num_cols] - if (length(dimensions) == 1 && dimensions == 'all'){ + if (length(dimensions) == 1 && dimensions == 'all') { dimensions <- colnames(summary_i) } else { stopifnot(dimensions %in% colnames(summary_i)) @@ -70,16 +81,20 @@ nearest_datasets.character <- function( #' @rdname nearest_datasets-methods #' @export nearest_datasets.data.frame <- function( - x, y = NULL, n_neighbors = 5, + x, + y = NULL, + n_neighbors = 5, dimensions = c('n_instances', 'n_features'), task = c('classification', 'regression'), - target_name = 'target', ...) { - + target_name = 'target', + ... 
+) { df <- if (is.null(y)) x else data.frame(x, target = y) # get summary stats for dataset - if (is.null(task)){ - task <- if (length(unique(df$target)) < 5) 'classification' else 'regression' + if (is.null(task)) { + task <- if (length(unique(df$target)) < 5) 'classification' else + 'regression' } else { task <- match.arg(task) } @@ -87,11 +102,15 @@ nearest_datasets.data.frame <- function( if (!(target_name %in% colnames(df))) stop(paste('Either x or y must contain', target_name)) - num_cols <- unlist(lapply(summary_stats, function(x) is.numeric(x)||is.integer(x))) - summary_task <- summary_stats[summary_stats$task == task, ] # restrict to same task + sum_stats <- summary_stats() + num_cols <- unlist(lapply( + sum_stats, + function(x) is.numeric(x) || is.integer(x) + )) + summary_task <- sum_stats[sum_stats$task == task, ] # restrict to same task summary_i <- summary_task[, num_cols] - if (length(dimensions) == 1 && dimensions == 'all'){ + if (length(dimensions) == 1 && dimensions == 'all') { dimensions <- colnames(summary_i) } else { stopifnot(dimensions %in% colnames(summary_i)) @@ -100,12 +119,12 @@ nearest_datasets.data.frame <- function( feat_names <- setdiff(colnames(df), target_name) types <- vector('character') - for (i in feat_names){ - types[i] <- get_type(df[,i], include_binary = TRUE) + for (i in feat_names) { + types[i] <- get_type(df[, i], include_binary = TRUE) } feat <- table(types) - for (type in c('binary', 'categorical', 'continuous')){ + for (type in c('binary', 'categorical', 'continuous')) { if (!type %in% names(feat)) feat[type] <- 0 } imb <- compute_imbalance(df[, target_name]) @@ -113,9 +132,9 @@ nearest_datasets.data.frame <- function( dataset_stats <- data.frame( n_instances = nrow(df), n_features = length(feat_names), - n_binary_features = feat['binary'], - n_categorical_features = feat['categorical'], - n_continuous_features = feat['continuous'], + n_binary_features = feat[['binary']], + n_categorical_features = 
feat[['categorical']], + n_continuous_features = feat[['continuous']], endpoint_type = get_type(df[, target_name]), n_classes = imb[['num_classes']], imbalance = imb[['imbalance']], @@ -136,23 +155,25 @@ nearest_datasets.data.frame <- function( #' where zero means that the dataset is perfectly balanced #' and the higher the value, the more imbalanced the dataset. #' -compute_imbalance <- function(target_col){ +compute_imbalance <- function(target_col) { imb <- 0 classes_count <- table(target_col) num_classes <- length(classes_count) - for (x in classes_count){ - p_x = x/length(target_col) + for (x in classes_count) { + p_x = x / length(target_col) } - if (p_x > 0){ - imb = imb + (p_x - 1/num_classes)*(p_x - 1/num_classes) + if (p_x > 0) { + imb = imb + (p_x - 1 / num_classes) * (p_x - 1 / num_classes) } # worst case scenario: all but 1 examplars in 1st class # the remaining one in 2nd class - worst_case <- (num_classes-1)*(1/num_classes)^2 + (1-1/num_classes)^2 + worst_case <- (num_classes - 1) * + (1 / num_classes)^2 + + (1 - 1 / num_classes)^2 - list(num_classes = num_classes, imbalance = imb/worst_case) + list(num_classes = num_classes, imbalance = imb / worst_case) } #' Get type/class of given vector. @@ -163,14 +184,17 @@ compute_imbalance <- function(target_col){ #' #' @return Type/class of `x`. 
#' -get_type <- function(x, include_binary = FALSE){ +get_type <- function(x, include_binary = FALSE) { x <- stats::na.omit(x) - if (inherits(x, 'numeric')){ + if (inherits(x, 'numeric')) { return('continuous') - } else if (inherits(x, 'integer') || inherits(x, 'factor')){ - if (include_binary){ - if (length(unique(x)) == 2) return('binary')} + } else if (inherits(x, 'integer') || inherits(x, 'factor')) { + if (include_binary) { + if (length(unique(x)) == 2) return('binary') + } return('categorical') - } else {stop("Cannot get types for dataset columns")} + } else { + stop("Cannot get types for dataset columns") + } } diff --git a/R/pmlb.R b/R/pmlb.R index dd374f2..a21edf4 100644 --- a/R/pmlb.R +++ b/R/pmlb.R @@ -9,7 +9,7 @@ #' @param local_cache_dir The directory on your local machine to store the data files in #' (defaults to NA, indicating cache will not be used) #' @param dropna Boolean. Whether rows with NAs should be automatically dropped. Default to TRUE. -#' @seealso \code{\link{summary_stats}}. +#' @seealso \code{\link{pmlb_metadata}}. #' @export #' @examples #' # Features and labels in single data frame @@ -23,21 +23,38 @@ #' penguins$y # vector #' } #' -fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, dropna = TRUE) { +fetch_data <- function( + dataset_name, + return_X_y = FALSE, + local_cache_dir = NA, + dropna = TRUE +) { GITHUB_URL <- "https://github.com/EpistasisLab/pmlb/raw/master/datasets" SUFFIX <- ".tsv.gz" - if (!dataset_name %in% dataset_names) { - stop("'dataset_name' ", dataset_name, " not found in PMLB.\n * Check spelling, capitalisation etc.", call. = FALSE) + if (!dataset_name %in% dataset_names()) { + stop( + "'dataset_name' ", + dataset_name, + " not found in PMLB.\n * Check spelling, capitalisation etc.", + call. = FALSE + ) } if (!(is.logical(return_X_y) && length(return_X_y) == 1)) { - stop("'return_X_y' must be TRUE or FALSE:\n * return_X_y is ", return_X_y, ".", call. 
= FALSE) + stop( + "'return_X_y' must be TRUE or FALSE:\n * return_X_y is ", + return_X_y, + ".", + call. = FALSE + ) } dataset_url <- paste0( - GITHUB_URL, "/", - dataset_name, "/", + GITHUB_URL, + "/", + dataset_name, + "/", dataset_name, SUFFIX ) @@ -63,7 +80,8 @@ fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, d # read file from cache if (file.exists(dataset_path)) { - dataset <- utils::read.csv(dataset_path, + dataset <- utils::read.csv( + dataset_path, sep = "\t", header = TRUE, stringsAsFactors = FALSE @@ -73,7 +91,8 @@ fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, d if (!graceful_download(dataset_url, dataset_path)) { message("Continuing gracefully without the dataset.") } else { - dataset <- utils::read.csv(dataset_path, + dataset <- utils::read.csv( + dataset_path, sep = "\t", header = TRUE, stringsAsFactors = FALSE @@ -97,7 +116,6 @@ fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, d } - #' pmlb: R interface to the Penn Machine Learning Benchmarks data repository #' #' The \href{https://github.com/EpistasisLab/pmlb}{PMLB} repository contains a curated collection of data sets for evaluating and @@ -110,7 +128,7 @@ fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, d #' include any of the PMLB data sets. The data sets can be downloaded using the \code{\link{fetch_data}} function which #' is similar to the corresponding PMLB python function. #' -#' See \code{\link{fetch_data}}, \code{\link{summary_stats}} for usage examples and further information. +#' See \code{\link{fetch_data}}, \code{\link{pmlb_metadata}} for usage examples and further information. 
#' #' If you use PMLB in a scientific publication, please consider citing the following paper: #' diff --git a/R/zzz.R b/R/zzz.R new file mode 100644 index 0000000..f24f10c --- /dev/null +++ b/R/zzz.R @@ -0,0 +1 @@ +.pmlbr_env <- new.env(parent = emptyenv()) diff --git a/README.Rmd b/README.Rmd index cb14069..0226b49 100644 --- a/README.Rmd +++ b/README.Rmd @@ -66,18 +66,19 @@ Let's check other available datasets and their summary statistics: ``` {r} # Dataset names -head(classification_dataset_names, 9) -head(regression_dataset_names, 9) +sample(classification_datasets(), 9) +sample(regression_datasets(), 9) # Dataset summaries -head(summary_stats) +sum_stats <- summary_stats() +head(sum_stats) ``` Selecting a subset of datasets that satisfy certain conditions is straight forward with `dplyr`. For example, if we need datasets with fewer than 100 observations for a classification task: ```{r warning=FALSE, message=FALSE} library(dplyr) -summary_stats %>% +sum_stats %>% filter(n_instances < 100, task == "classification") %>% pull(dataset) ``` diff --git a/README.md b/README.md index 42863e6..6386202 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,11 @@ library(pmlbr) # Download features and labels for penguins dataset in single data frame penguins <- fetch_data("penguins") +``` + + ## Download successful. + +``` r str(penguins) ``` @@ -73,6 +78,11 @@ str(penguins) ``` r # Download features and labels for penguins dataset in separate data structures penguins <- fetch_data("penguins", return_X_y = TRUE) +``` + + ## Download successful. 
+ +``` r head(penguins$x) # data frame ``` @@ -94,40 +104,45 @@ Let’s check other available datasets and their summary statistics: ``` r # Dataset names -head(classification_dataset_names, 9) +sample(classification_datasets(), 9) ``` - ## [1] "adult" "agaricus_lepiota" "allbp" - ## [4] "allhyper" "allhypo" "allrep" - ## [7] "analcatdata_aids" "analcatdata_asbestos" "analcatdata_authorship" + ## [1] "heart_disease_hungarian" "fars" + ## [3] "allrep" "_deprecated_colic" + ## [5] "Hill_Valley_without_noise" "_deprecated_german" + ## [7] "sleep" "_deprecated_cleveland_nominal" + ## [9] "analcatdata_happiness" ``` r -head(regression_dataset_names, 9) +sample(regression_datasets(), 9) ``` - ## [1] "1027_ESL" "1028_SWD" "1029_LEV" - ## [4] "1030_ERA" "1089_USCrime" "1096_FacultySalaries" - ## [7] "1191_BNG_pbc" "1193_BNG_lowbwt" "1196_BNG_pharynx" + ## [1] "527_analcatdata_election2000" "1089_USCrime" + ## [3] "feynman_III_8_54" "225_puma8NH" + ## [5] "657_fri_c2_250_10" "strogatz_glider2" + ## [7] "611_fri_c3_100_5" "586_fri_c3_1000_25" + ## [9] "650_fri_c0_500_50" ``` r # Dataset summaries -head(summary_stats) +sum_stats <- summary_stats() +head(sum_stats) ``` ## dataset n_instances n_features n_binary_features ## 1 1027_ESL 488 4 0 - ## 2 1028_SWD 1000 10 0 + ## 2 1028_SWD 1000 10 1 ## 3 1029_LEV 1000 4 0 ## 4 1030_ERA 1000 4 0 - ## 5 1089_USCrime 47 13 0 - ## 6 1096_FacultySalaries 50 4 0 + ## 5 1089_USCrime 47 13 1 + ## 6 1096_FacultySalaries 50 4 1 ## n_categorical_features n_continuous_features endpoint_type n_classes - ## 1 0 4 continuous 9 - ## 2 0 10 continuous 4 - ## 3 0 4 continuous 5 + ## 1 4 0 continuous 9 + ## 2 9 0 continuous 4 + ## 3 4 0 continuous 5 ## 4 0 4 continuous 9 - ## 5 0 13 continuous 42 - ## 6 0 4 continuous 39 + ## 5 0 12 continuous 42 + ## 6 0 3 continuous 39 ## imbalance task ## 1 0.099363200 regression ## 2 0.108290667 regression @@ -142,7 +157,7 @@ fewer than 100 observations for a classification task: ``` r library(dplyr) -summary_stats 
%>% +sum_stats %>% filter(n_instances < 100, task == "classification") %>% pull(dataset) ``` diff --git a/data-raw/get-summary.R b/data-raw/get-summary.R deleted file mode 100644 index e849738..0000000 --- a/data-raw/get-summary.R +++ /dev/null @@ -1,12 +0,0 @@ -links_to_stats <- 'https://github.com/EpistasisLab/pmlb/raw/master/pmlb/all_summary_stats.tsv' -summary_stats <- read.csv(links_to_stats, sep = '\t') -colnames(summary_stats) <- tolower(gsub('X.', 'n_', colnames(summary_stats))) -dataset_names <- summary_stats$dataset - -regression_dataset_names <- sort(subset(summary_stats, task == 'regression')$dataset) -classification_dataset_names <- sort(subset(summary_stats, task == 'classification')$dataset) - -usethis::use_data(summary_stats, dataset_names, - classification_dataset_names, - regression_dataset_names, - overwrite = TRUE) diff --git a/data/classification_dataset_names.rda b/data/classification_dataset_names.rda deleted file mode 100644 index 87951c8..0000000 Binary files a/data/classification_dataset_names.rda and /dev/null differ diff --git a/data/dataset_names.rda b/data/dataset_names.rda deleted file mode 100644 index 0f831e9..0000000 Binary files a/data/dataset_names.rda and /dev/null differ diff --git a/data/regression_dataset_names.rda b/data/regression_dataset_names.rda deleted file mode 100644 index ab1e042..0000000 Binary files a/data/regression_dataset_names.rda and /dev/null differ diff --git a/data/summary_stats.rda b/data/summary_stats.rda deleted file mode 100644 index b8050c6..0000000 Binary files a/data/summary_stats.rda and /dev/null differ diff --git a/man/classification_dataset_names.Rd b/man/classification_dataset_names.Rd deleted file mode 100644 index b682290..0000000 --- a/man/classification_dataset_names.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{classification_dataset_names} -\alias{classification_dataset_names} -\title{Names of 
available classification datasets} -\format{ -An object of class \code{character} of length 162. -} -\source{ -\url{https://github.com/EpistasisLab/pmlb} -} -\usage{ -classification_dataset_names -} -\description{ -A list of the names of available classification datasets -} -\keyword{datasets} diff --git a/man/classification_datasets.Rd b/man/classification_datasets.Rd new file mode 100644 index 0000000..6625ca4 --- /dev/null +++ b/man/classification_datasets.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/list_datasets.R +\name{classification_datasets} +\alias{classification_datasets} +\title{Classification datasets} +\usage{ +classification_datasets() +} +\value{ +A character vector of classification dataset names. +} +\description{ +Classification datasets +} +\examples{ +if (interactive()) { + sample(classification_datasets(), 10) +} +} diff --git a/man/dataset_names.Rd b/man/dataset_names.Rd index 1692325..1b1f85a 100644 --- a/man/dataset_names.Rd +++ b/man/dataset_names.Rd @@ -1,19 +1,19 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} +% Please edit documentation in R/list_datasets.R \name{dataset_names} \alias{dataset_names} -\title{Names of all available datasets} -\format{ -An object of class \code{character} of length 284. -} -\source{ -\url{https://github.com/EpistasisLab/pmlb} -} +\title{All available datasets} \usage{ -dataset_names +dataset_names() +} +\value{ +A character vector of all dataset names. } \description{ -A list of the names of available datasets +All available datasets +} +\examples{ +if (interactive()) { + sample(dataset_names(), 10) +} } -\keyword{datasets} diff --git a/man/fetch_data.Rd b/man/fetch_data.Rd index 9f7da14..cbdc9f5 100644 --- a/man/fetch_data.Rd +++ b/man/fetch_data.Rd @@ -40,5 +40,5 @@ if (interactive()){ } \seealso{ -\code{\link{summary_stats}}. +\code{\link{pmlb_metadata}}. 
} diff --git a/man/nearest_datasets-methods.Rd b/man/nearest_datasets-methods.Rd index 143e36e..c9d7ea7 100644 --- a/man/nearest_datasets-methods.Rd +++ b/man/nearest_datasets-methods.Rd @@ -39,7 +39,7 @@ or data.frame of n_samples x n_features(or n_features+1 with a target column)} \item{dimensions}{Character vector specifying dataset characteristics to include in similarity calculation. Dimensions must correspond to numeric columns of -[all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summary_stats.tsv). +[all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summary_stats.tsv). If 'all' (default), uses all numeric columns.} \item{target_name}{Character string specifying column of target/dependent variable.} diff --git a/man/pmlb.Rd b/man/pmlb.Rd index be252e6..61d7511 100644 --- a/man/pmlb.Rd +++ b/man/pmlb.Rd @@ -17,7 +17,7 @@ This R library includes summaries of the classification and regression data sets include any of the PMLB data sets. The data sets can be downloaded using the \code{\link{fetch_data}} function which is similar to the corresponding PMLB python function. -See \code{\link{fetch_data}}, \code{\link{summary_stats}} for usage examples and further information. +See \code{\link{fetch_data}}, \code{\link{pmlb_metadata}} for usage examples and further information. 
If you use PMLB in a scientific publication, please consider citing the following paper: diff --git a/man/pmlb_metadata.Rd b/man/pmlb_metadata.Rd new file mode 100644 index 0000000..9b6fa96 --- /dev/null +++ b/man/pmlb_metadata.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/list_datasets.R +\name{pmlb_metadata} +\alias{pmlb_metadata} +\title{Get metadata for all datasets in PMLB.} +\usage{ +pmlb_metadata() +} +\value{ +A list containing summary_stats, dataset_names, classification_datasets, and regression_datasets +} +\description{ +Metadata like summary statistics and names of available datasets +on the PMLB repository. +} +\examples{ +if (interactive()) { + sample(pmlb_metadata()$dataset_names, 10) +} +} diff --git a/man/regression_dataset_names.Rd b/man/regression_dataset_names.Rd deleted file mode 100644 index c28c42b..0000000 --- a/man/regression_dataset_names.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} -\name{regression_dataset_names} -\alias{regression_dataset_names} -\title{Names of available regression datasets} -\format{ -An object of class \code{character} of length 122. -} -\source{ -\url{https://github.com/EpistasisLab/pmlb} -} -\usage{ -regression_dataset_names -} -\description{ -A list of the names of available regression datasets -} -\keyword{datasets} diff --git a/man/regression_datasets.Rd b/man/regression_datasets.Rd new file mode 100644 index 0000000..c7c4abe --- /dev/null +++ b/man/regression_datasets.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/list_datasets.R +\name{regression_datasets} +\alias{regression_datasets} +\title{Regression datasets} +\usage{ +regression_datasets() +} +\value{ +A character vector of regression dataset names. 
+} +\description{ +Regression datasets +} +\examples{ +if (interactive()) { + sample(regression_datasets(), 10) +} +} diff --git a/man/summary_stats.Rd b/man/summary_stats.Rd index 6f8539f..638f319 100644 --- a/man/summary_stats.Rd +++ b/man/summary_stats.Rd @@ -1,31 +1,20 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data.R -\docType{data} +% Please edit documentation in R/list_datasets.R \name{summary_stats} \alias{summary_stats} -\title{Summary statistics for the all datasets} -\format{ -A data frame with 10 variables: -\describe{ - \item{dataset:}{Dataset name} - \item{n_instances:}{Number of data observations (equal to number of rows)} - \item{n_features:}{Total number of features (number of columns - 1)} - \item{n_binary_features:}{Number of binary features} - \item{n_categorical_features:}{Number of categorical features} - \item{n_continuous_features:}{Number of continuous features} - \item{n_classes:}{Number of classes in target variable} - \item{endpoint_type:}{Value type of endpoint/target (can be binary, categorical or continuous)} - \item{imbalance:}{Imbalance metric, where zero means that the dataset is perfectly balanced and the higher the value, the more imbalanced the dataset} - \item{task:}{Type of problem/task. Can be classification or regression.} +\title{Summary statistics} +\usage{ +summary_stats() } +\value{ +A dataframe of summary statistics of all available datasets, +including number of instances/rows, number of columns/features, task, etc. } -\source{ -\url{https://github.com/EpistasisLab/pmlb} +\description{ +Summary statistics } -\usage{ -summary_stats +\examples{ +if (interactive()) { + head(summary_stats()) } -\description{ -Summary statistics for the all datasets } -\keyword{datasets} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..af3cfcb --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. 
+# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + +library(testthat) +library(pmlbr) + +test_check("pmlbr") diff --git a/tests/testthat/test-list_datasets.R b/tests/testthat/test-list_datasets.R new file mode 100644 index 0000000..6c27db4 --- /dev/null +++ b/tests/testthat/test-list_datasets.R @@ -0,0 +1,12 @@ +test_that("pmlb_metadata works as expected", { + skip_on_cran() + data <- pmlb_metadata() + expect_true(is.data.frame(data$summary_stats)) + expect_true(is.character(data$dataset_names)) + expect_true(is.character(data$classification_datasets)) + expect_true(is.character(data$regression_datasets)) + + expect_true("penguins" %in% dataset_names()) + expect_true("penguins" %in% classification_datasets()) + expect_true("1089_USCrime" %in% regression_datasets()) +}) diff --git a/tests/testthat/test-nearest_datasets.R b/tests/testthat/test-nearest_datasets.R new file mode 100644 index 0000000..41a2bd5 --- /dev/null +++ b/tests/testthat/test-nearest_datasets.R @@ -0,0 +1,16 @@ +test_that("nearest dataset is itself", { + skip_on_cran() + expect_equal( + nearest_datasets("penguins")[[1]], + "penguins" + ) + expect_equal( + nearest_datasets(fetch_data("lupus"))[[1]], + "lupus" + ) + + expect_equal( + nearest_datasets("1089_USCrime")[[1]], + "1089_USCrime" + ) +})