diff --git a/DESCRIPTION b/DESCRIPTION
index 934dcf2..1573822 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -5,7 +5,7 @@ Description: Check available classification and regression data sets from the PM
     These data sets cover a range of applications, and include binary/multi-class classification problems and 
     regression problems, as well as combinations of categorical, ordinal, and continuous features.
     There are currently over 150 datasets included in the PMLB repository.
-Version: 0.2.3
+Version: 0.3.0
 Authors@R: c(
     person("Trang", "Le", email = "grixor@gmail.com", role = c("aut", "cre"), comment = "https://trang.page/"),
     person("makeyourownmaker", email = "makeyourownmaker@gmx.com", role = "aut", comment = "https://github.com/makeyourownmaker"),
@@ -22,3 +22,6 @@ URL: https://github.com/EpistasisLab/pmlbr
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.3.2
+Suggests: 
+    testthat (>= 3.0.0)
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
index 10d52ce..3a882a6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,5 +3,10 @@
 S3method(nearest_datasets,character)
 S3method(nearest_datasets,data.frame)
 S3method(nearest_datasets,default)
+export(classification_datasets)
+export(dataset_names)
 export(fetch_data)
 export(nearest_datasets)
+export(pmlb_metadata)
+export(regression_datasets)
+export(summary_stats)
diff --git a/NEWS.md b/NEWS.md
index f882534..b58ab4b 100644
--- a/NEWS.md
+++ b/NEWS.md
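@@ -1,3 +1,9 @@
+# pmlbr 0.3.0
+
+* Dataset metadata is no longer bundled with the package: `summary_stats()` and `dataset_names()` are now functions that read the metadata from the PMLB repository on first use and cache it for the rest of the session.
+* `classification_dataset_names` and `regression_dataset_names` are replaced by `classification_datasets()` and `regression_datasets()`.
+* New `pmlb_metadata()` returns the summary statistics and all dataset name vectors in one list.
+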
 # pmlbr 0.2.3
 
 * Use interactive()
diff --git a/R/data.R b/R/data.R
deleted file mode 100644
index aea3ed0..0000000
--- a/R/data.R
+++ /dev/null
@@ -1,39 +0,0 @@
-#' Names of all available datasets
-#'
-#' A list of the names of available datasets
-#'
-#' @source \url{https://github.com/EpistasisLab/pmlb}
-"dataset_names"
-
-#' Names of available classification datasets
-#'
-#' A list of the names of available classification datasets
-#'
-#' @source \url{https://github.com/EpistasisLab/pmlb}
-"classification_dataset_names"
-
-#' Names of available regression datasets
-#'
-#' A list of the names of available regression datasets
-#'
-#' @source \url{https://github.com/EpistasisLab/pmlb}
-"regression_dataset_names"
-
-#' Summary statistics for the all datasets
-#'
-#' @format A data frame with 10 variables:
-#' \describe{
-#'   \item{dataset:}{Dataset name}
-#'   \item{n_instances:}{Number of data observations (equal to number of rows)}
-#'   \item{n_features:}{Total number of features (number of columns - 1)}
-#'   \item{n_binary_features:}{Number of binary features}
-#'   \item{n_categorical_features:}{Number of categorical features}
-#'   \item{n_continuous_features:}{Number of continuous features}
-#'   \item{n_classes:}{Number of classes in target variable}
-#'   \item{endpoint_type:}{Value type of endpoint/target (can be binary, categorical or continuous)}
-#'   \item{imbalance:}{Imbalance metric, where zero means that the dataset is perfectly balanced and the higher the value, the more imbalanced the dataset}
-#'   \item{task:}{Type of problem/task. Can be classification or regression.}
-#' }
-#'
-#' @source \url{https://github.com/EpistasisLab/pmlb}
-"summary_stats"
diff --git a/R/globals.R b/R/globals.R
deleted file mode 100644
index 3d00df9..0000000
--- a/R/globals.R
+++ /dev/null
@@ -1 +0,0 @@
-utils::globalVariables(c('dataset_names', 'summary_stats'))
diff --git a/R/list_datasets.R b/R/list_datasets.R
new file mode 100644
index 0000000..7bdc007
--- /dev/null
+++ b/R/list_datasets.R
@@ -0,0 +1,118 @@
+#' Get metadata for all datasets in PMLB.
+#'
+#' Metadata like summary statistics and names of available datasets
+#' on the PMLB repository.
+#'
+#' @return A list containing summary_stats, dataset_names, classification_datasets, and regression_datasets
+#' @export
+#' @examples
+#' if (interactive()) {
+#'   sample(pmlb_metadata()$dataset_names, 10)
+#' }
+pmlb_metadata <- function() {
+  # The summary table is downloaded and parsed only on the first call; the
+  # results are stored in .pmlbr_env and reused by later calls.
+  if (!exists("summary_stats", envir = .pmlbr_env, inherits = FALSE)) {
+    links_to_stats <- 'https://github.com/EpistasisLab/pmlb/raw/master/pmlb/all_summary_stats.tsv'
+    # stringsAsFactors = FALSE keeps the name columns character vectors on
+    # R < 4.0 as well, as the documented return values promise.
+    summary_stats <- tryCatch(
+      utils::read.csv(links_to_stats, sep = '\t', stringsAsFactors = FALSE),
+      error = function(e) {
+        stop(
+          "Could not download PMLB metadata from ",
+          links_to_stats,
+          ".\n * ",
+          conditionMessage(e),
+          call. = FALSE
+        )
+      }
+    )
+    # rename only a literal leading 'X.' (presumably what read.csv() makes of
+    # a '#'-prefixed header -- TODO confirm) instead of any 'X' + character
+    colnames(summary_stats) <- tolower(sub(
+      '^X\\.',
+      'n_',
+      colnames(summary_stats)
+    ))
+    assign(
+      "dataset_names",
+      summary_stats$dataset,
+      envir = .pmlbr_env
+    )
+    assign(
+      "regression_datasets",
+      sort(summary_stats[summary_stats$task == "regression", "dataset"]),
+      envir = .pmlbr_env
+    )
+    assign(
+      "classification_datasets",
+      sort(summary_stats[summary_stats$task == "classification", "dataset"]),
+      envir = .pmlbr_env
+    )
+    # stored last: its presence is the "cache is filled" marker tested above
+    assign(
+      "summary_stats",
+      summary_stats,
+      envir = .pmlbr_env
+    )
+  }
+
+  list(
+    summary_stats = .pmlbr_env$summary_stats,
+    dataset_names = .pmlbr_env$dataset_names,
+    classification_datasets = .pmlbr_env$classification_datasets,
+    regression_datasets = .pmlbr_env$regression_datasets
+  )
+}
+
+
+#' All available datasets
+#'
+#' @return A character vector of all dataset names.
+#' @export
+#' @examples
+#' if (interactive()) {
+#'   sample(dataset_names(), 10)
+#' }
+dataset_names <- function() {
+  pmlb_metadata()$dataset_names
+}
+
+#' Classification datasets
+#'
+#' @return A character vector of classification dataset names.
+#' @export
+#' @examples
+#' if (interactive()) {
+#'   sample(classification_datasets(), 10)
+#' }
+classification_datasets <- function() {
+  pmlb_metadata()$classification_datasets
+}
+
+#' Regression datasets
+#'
+#' @return A character vector of regression dataset names.
+#' @export
+#' @examples
+#' if (interactive()) {
+#'   sample(regression_datasets(), 10)
+#' }
+regression_datasets <- function() {
+  pmlb_metadata()$regression_datasets
+}
+
+#' Summary statistics
+#'
+#' @return A dataframe of summary statistics of all available datasets,
+#' including number of instances/rows, number of columns/features, task, etc.
+#'
+#' @export
+#' @examples
+#' if (interactive()) {
+#'   head(summary_stats())
+#' }
+summary_stats <- function() {
+  pmlb_metadata()$summary_stats
+}
diff --git a/R/nearest.R b/R/nearest.R
index 336df7e..d08b515 100644
--- a/R/nearest.R
+++ b/R/nearest.R
@@ -10,7 +10,7 @@
 #' @param n_neighbors Integer. The number of dataset names to return as neighbors.
 #' @param dimensions Character vector specifying dataset characteristics to include in similarity calculation.
 #' Dimensions must correspond to numeric columns of
-#' [all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summary_stats.tsv).
+#' [all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summary_stats.tsv).
 #' If 'all' (default), uses all numeric columns.
 #' @param task Character string specifying classification or regression for summary stat generation.
 #' @param target_name Character string specifying column of target/dependent variable.
@@ -26,14 +26,14 @@
 #'   nearest_datasets('penguins')
 #'   nearest_datasets(fetch_data('penguins'))
 #' }
-nearest_datasets <- function(x, ...){
+nearest_datasets <- function(x, ...) {
   UseMethod('nearest_datasets', x)
 }
 
 
 #' @rdname nearest_datasets-methods
 #' @export
-nearest_datasets.default <- function(x, ...){
+nearest_datasets.default <- function(x, ...) {
   stop('`x` must be of class `data.frame` or `character`.')
 }
 
@@ -41,19 +41,30 @@ nearest_datasets.default <- function(x, ...){
 #' @rdname nearest_datasets-methods
 #' @export
 nearest_datasets.character <- function(
-  x, n_neighbors = 5,
+  x,
+  n_neighbors = 5,
   dimensions = c('n_instances', 'n_features'),
-  target_name = 'target', ...) {
-
-  if (!(x %in% dataset_names))
-    stop("'dataset_name' ", x, " not found in PMLB.\n * Check spelling, capitalisation etc.", call.=FALSE)
-  dataset_stats <- summary_stats[summary_stats$dataset == x, ]
-
-  num_cols <- unlist(lapply(summary_stats, function(x) is.numeric(x)||is.integer(x)))
-  summary_task <- summary_stats[summary_stats$task == dataset_stats$task, ] # restrict to same task
+  target_name = 'target',
+  ...
+) {
+  if (!(x %in% dataset_names()))
+    stop(
+      "'dataset_name' ",
+      x,
+      " not found in PMLB.\n * Check spelling, capitalisation etc.",
+      call. = FALSE
+    )
+  sum_stats <- summary_stats()
+  dataset_stats <- sum_stats[sum_stats$dataset == x, ]
+
+  num_cols <- unlist(lapply(
+    sum_stats,
+    function(x) is.numeric(x) || is.integer(x)
+  ))
+  summary_task <- sum_stats[sum_stats$task == dataset_stats$task, ] # restrict to same task
   summary_i <- summary_task[, num_cols]
 
-  if (length(dimensions) == 1 && dimensions == 'all'){
+  if (length(dimensions) == 1 && dimensions == 'all') {
     dimensions <- colnames(summary_i)
   } else {
     stopifnot(dimensions %in% colnames(summary_i))
@@ -70,16 +81,20 @@ nearest_datasets.character <- function(
 #' @rdname nearest_datasets-methods
 #' @export
 nearest_datasets.data.frame <- function(
-  x, y = NULL, n_neighbors = 5,
+  x,
+  y = NULL,
+  n_neighbors = 5,
   dimensions = c('n_instances', 'n_features'),
   task = c('classification', 'regression'),
-  target_name = 'target', ...) {
-
+  target_name = 'target',
+  ...
+) {
   df <- if (is.null(y)) x else data.frame(x, target = y)
 
   # get summary stats for dataset
-  if (is.null(task)){
-    task <- if (length(unique(df$target)) < 5) 'classification' else 'regression'
+  if (is.null(task)) {
+    task <- if (length(unique(df$target)) < 5) 'classification' else
+      'regression'
   } else {
     task <- match.arg(task)
   }
@@ -87,11 +102,15 @@ nearest_datasets.data.frame <- function(
   if (!(target_name %in% colnames(df)))
     stop(paste('Either x or y must contain', target_name))
 
-  num_cols <- unlist(lapply(summary_stats, function(x) is.numeric(x)||is.integer(x)))
-  summary_task <- summary_stats[summary_stats$task == task, ] # restrict to same task
+  sum_stats <- summary_stats()
+  num_cols <- unlist(lapply(
+    sum_stats,
+    function(x) is.numeric(x) || is.integer(x)
+  ))
+  summary_task <- sum_stats[sum_stats$task == task, ] # restrict to same task
   summary_i <- summary_task[, num_cols]
 
-  if (length(dimensions) == 1 && dimensions == 'all'){
+  if (length(dimensions) == 1 && dimensions == 'all') {
     dimensions <- colnames(summary_i)
   } else {
     stopifnot(dimensions %in% colnames(summary_i))
@@ -100,12 +119,12 @@ nearest_datasets.data.frame <- function(
 
   feat_names <- setdiff(colnames(df), target_name)
   types <- vector('character')
-  for (i in feat_names){
-    types[i] <- get_type(df[,i], include_binary = TRUE)
+  for (i in feat_names) {
+    types[i] <- get_type(df[, i], include_binary = TRUE)
   }
 
   feat <- table(types)
-  for (type in c('binary', 'categorical', 'continuous')){
+  for (type in c('binary', 'categorical', 'continuous')) {
     if (!type %in% names(feat)) feat[type] <- 0
   }
   imb <- compute_imbalance(df[, target_name])
@@ -113,9 +132,9 @@ nearest_datasets.data.frame <- function(
   dataset_stats <- data.frame(
     n_instances = nrow(df),
     n_features = length(feat_names),
-    n_binary_features = feat['binary'],
-    n_categorical_features = feat['categorical'],
-    n_continuous_features = feat['continuous'],
+    n_binary_features = feat[['binary']],
+    n_categorical_features = feat[['categorical']],
+    n_continuous_features = feat[['continuous']],
     endpoint_type = get_type(df[, target_name]),
     n_classes = imb[['num_classes']],
     imbalance = imb[['imbalance']],
@@ -136,23 +155,26 @@ nearest_datasets.data.frame <- function(
 #' where zero means that the dataset is perfectly balanced
 #' and the higher the value, the more imbalanced the dataset.
 #'
-compute_imbalance <- function(target_col){
+compute_imbalance <- function(target_col) {
   imb <- 0
   classes_count <- table(target_col)
   num_classes <- length(classes_count)
-  for (x in classes_count){
-    p_x = x/length(target_col)
+  # sum, over every class, the squared distance between its observed
+  # proportion and the proportion it would have in a balanced dataset
+  for (x in classes_count) {
+    p_x <- x / length(target_col)
+    if (p_x > 0) {
+      imb <- imb + (p_x - 1 / num_classes)^2
+    }
   }
 
-  if (p_x > 0){
-    imb = imb + (p_x - 1/num_classes)*(p_x - 1/num_classes)
-  }
-
   # worst case scenario: all but 1 examplars in 1st class
   # the remaining one in 2nd class
-  worst_case <- (num_classes-1)*(1/num_classes)^2 + (1-1/num_classes)^2
+  worst_case <- (num_classes - 1) *
+    (1 / num_classes)^2 +
+    (1 - 1 / num_classes)^2
 
-  list(num_classes = num_classes, imbalance = imb/worst_case)
+  list(num_classes = num_classes, imbalance = imb / worst_case)
 }
 
 #' Get type/class of given vector.
@@ -163,14 +185,17 @@ compute_imbalance <- function(target_col){
 #'
 #' @return Type/class of `x`.
 #'
-get_type <- function(x, include_binary = FALSE){
+get_type <- function(x, include_binary = FALSE) {
   x <- stats::na.omit(x)
 
-  if (inherits(x, 'numeric')){
+  if (inherits(x, 'numeric')) {
     return('continuous')
-  } else if (inherits(x, 'integer') || inherits(x, 'factor')){
-    if (include_binary){
-      if (length(unique(x)) == 2) return('binary')}
+  } else if (inherits(x, 'integer') || inherits(x, 'factor')) {
+    if (include_binary) {
+      if (length(unique(x)) == 2) return('binary')
+    }
     return('categorical')
-  } else {stop("Cannot get types for dataset columns")}
+  } else {
+    stop("Cannot get types for dataset columns")
+  }
 }
diff --git a/R/pmlb.R b/R/pmlb.R
index dd374f2..a21edf4 100644
--- a/R/pmlb.R
+++ b/R/pmlb.R
@@ -9,7 +9,7 @@
 #' @param local_cache_dir The directory on your local machine to store the data files in
 #' (defaults to NA, indicating cache will not be used)
 #' @param dropna Boolean. Whether rows with NAs should be automatically dropped. Default to TRUE.
-#' @seealso \code{\link{summary_stats}}.
+#' @seealso \code{\link{pmlb_metadata}}.
 #' @export
 #' @examples
 #' # Features and labels in single data frame
@@ -23,21 +23,38 @@
 #'   penguins$y # vector
 #' }
 #'
-fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, dropna = TRUE) {
+fetch_data <- function(
+  dataset_name,
+  return_X_y = FALSE,
+  local_cache_dir = NA,
+  dropna = TRUE
+) {
   GITHUB_URL <- "https://github.com/EpistasisLab/pmlb/raw/master/datasets"
   SUFFIX <- ".tsv.gz"
 
-  if (!dataset_name %in% dataset_names) {
-    stop("'dataset_name' ", dataset_name, " not found in PMLB.\n * Check spelling, capitalisation etc.", call. = FALSE)
+  if (!dataset_name %in% dataset_names()) {
+    stop(
+      "'dataset_name' ",
+      dataset_name,
+      " not found in PMLB.\n * Check spelling, capitalisation etc.",
+      call. = FALSE
+    )
   }
 
   if (!(is.logical(return_X_y) && length(return_X_y) == 1)) {
-    stop("'return_X_y' must be TRUE or FALSE:\n * return_X_y is ", return_X_y, ".", call. = FALSE)
+    stop(
+      "'return_X_y' must be TRUE or FALSE:\n * return_X_y is ",
+      return_X_y,
+      ".",
+      call. = FALSE
+    )
   }
 
   dataset_url <- paste0(
-    GITHUB_URL, "/",
-    dataset_name, "/",
+    GITHUB_URL,
+    "/",
+    dataset_name,
+    "/",
     dataset_name,
     SUFFIX
   )
@@ -63,7 +80,8 @@ fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, d
 
     # read file from cache
     if (file.exists(dataset_path)) {
-      dataset <- utils::read.csv(dataset_path,
+      dataset <- utils::read.csv(
+        dataset_path,
         sep = "\t",
         header = TRUE,
         stringsAsFactors = FALSE
@@ -73,7 +91,8 @@ fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, d
       if (!graceful_download(dataset_url, dataset_path)) {
         message("Continuing gracefully without the dataset.")
       } else {
-        dataset <- utils::read.csv(dataset_path,
+        dataset <- utils::read.csv(
+          dataset_path,
           sep = "\t",
           header = TRUE,
           stringsAsFactors = FALSE
@@ -97,7 +116,6 @@ fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, d
 }
 
 
-
 #' pmlb: R interface to the Penn Machine Learning Benchmarks data repository
 #'
 #' The \href{https://github.com/EpistasisLab/pmlb}{PMLB} repository contains a curated collection of data sets for evaluating and
@@ -110,7 +128,7 @@ fetch_data <- function(dataset_name, return_X_y = FALSE, local_cache_dir = NA, d
 #' include any of the PMLB data sets.  The data sets can be downloaded using the \code{\link{fetch_data}} function which
 #' is similar to the corresponding PMLB python function.
 #'
-#' See \code{\link{fetch_data}}, \code{\link{summary_stats}} for usage examples and further information.
+#' See \code{\link{fetch_data}}, \code{\link{pmlb_metadata}} for usage examples and further information.
 #'
 #' If you use PMLB in a scientific publication, please consider citing the following paper:
 #'
diff --git a/R/zzz.R b/R/zzz.R
new file mode 100644
index 0000000..f24f10c
--- /dev/null
+++ b/R/zzz.R
@@ -0,0 +1 @@
+.pmlbr_env <- new.env(parent = emptyenv())
diff --git a/README.Rmd b/README.Rmd
index cb14069..0226b49 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -66,18 +66,19 @@ Let's check other available datasets and their summary statistics:
 
 ``` {r}
 # Dataset names
-head(classification_dataset_names, 9)
-head(regression_dataset_names, 9)
+sample(classification_datasets(), 9)
+sample(regression_datasets(), 9)
 
 # Dataset summaries
-head(summary_stats)
+sum_stats <- summary_stats()
+head(sum_stats)
 ```
 
 Selecting a subset of datasets that satisfy certain conditions is straight forward with `dplyr`.
 For example, if we need datasets with fewer than 100 observations for a classification task: 
 ```{r warning=FALSE, message=FALSE}
 library(dplyr)
-summary_stats %>%
+sum_stats %>%
   filter(n_instances < 100, task == "classification") %>%
   pull(dataset)
 ```
diff --git a/README.md b/README.md
index 42863e6..6386202 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,11 @@ library(pmlbr)
 
 # Download features and labels for penguins dataset in single data frame
 penguins <- fetch_data("penguins")
+```
+
+    ## Download successful.
+
+``` r
 str(penguins)
 ```
 
@@ -73,6 +78,11 @@ str(penguins)
 ``` r
 # Download features and labels for penguins dataset in separate data structures
 penguins <- fetch_data("penguins", return_X_y = TRUE)
+```
+
+    ## Download successful.
+
+``` r
 head(penguins$x) # data frame
 ```
 
@@ -94,40 +104,45 @@ Let’s check other available datasets and their summary statistics:
 
 ``` r
 # Dataset names
-head(classification_dataset_names, 9)
+sample(classification_datasets(), 9)
 ```
 
-    ## [1] "adult"                  "agaricus_lepiota"       "allbp"                 
-    ## [4] "allhyper"               "allhypo"                "allrep"                
-    ## [7] "analcatdata_aids"       "analcatdata_asbestos"   "analcatdata_authorship"
+    ## [1] "heart_disease_hungarian"       "fars"                         
+    ## [3] "allrep"                        "_deprecated_colic"            
+    ## [5] "Hill_Valley_without_noise"     "_deprecated_german"           
+    ## [7] "sleep"                         "_deprecated_cleveland_nominal"
+    ## [9] "analcatdata_happiness"
 
 ``` r
-head(regression_dataset_names, 9)
+sample(regression_datasets(), 9)
 ```
 
-    ## [1] "1027_ESL"             "1028_SWD"             "1029_LEV"            
-    ## [4] "1030_ERA"             "1089_USCrime"         "1096_FacultySalaries"
-    ## [7] "1191_BNG_pbc"         "1193_BNG_lowbwt"      "1196_BNG_pharynx"
+    ## [1] "527_analcatdata_election2000" "1089_USCrime"                
+    ## [3] "feynman_III_8_54"             "225_puma8NH"                 
+    ## [5] "657_fri_c2_250_10"            "strogatz_glider2"            
+    ## [7] "611_fri_c3_100_5"             "586_fri_c3_1000_25"          
+    ## [9] "650_fri_c0_500_50"
 
 ``` r
 # Dataset summaries
-head(summary_stats)
+sum_stats <- summary_stats()
+head(sum_stats)
 ```
 
     ##                dataset n_instances n_features n_binary_features
     ## 1             1027_ESL         488          4                 0
-    ## 2             1028_SWD        1000         10                 0
+    ## 2             1028_SWD        1000         10                 1
     ## 3             1029_LEV        1000          4                 0
     ## 4             1030_ERA        1000          4                 0
-    ## 5         1089_USCrime          47         13                 0
-    ## 6 1096_FacultySalaries          50          4                 0
+    ## 5         1089_USCrime          47         13                 1
+    ## 6 1096_FacultySalaries          50          4                 1
     ##   n_categorical_features n_continuous_features endpoint_type n_classes
-    ## 1                      0                     4    continuous         9
-    ## 2                      0                    10    continuous         4
-    ## 3                      0                     4    continuous         5
+    ## 1                      4                     0    continuous         9
+    ## 2                      9                     0    continuous         4
+    ## 3                      4                     0    continuous         5
     ## 4                      0                     4    continuous         9
-    ## 5                      0                    13    continuous        42
-    ## 6                      0                     4    continuous        39
+    ## 5                      0                    12    continuous        42
+    ## 6                      0                     3    continuous        39
     ##     imbalance       task
     ## 1 0.099363200 regression
     ## 2 0.108290667 regression
@@ -142,7 +157,7 @@ fewer than 100 observations for a classification task:
 
 ``` r
 library(dplyr)
-summary_stats %>%
+sum_stats %>%
   filter(n_instances < 100, task == "classification") %>%
   pull(dataset)
 ```
diff --git a/data-raw/get-summary.R b/data-raw/get-summary.R
deleted file mode 100644
index e849738..0000000
--- a/data-raw/get-summary.R
+++ /dev/null
@@ -1,12 +0,0 @@
-links_to_stats <- 'https://github.com/EpistasisLab/pmlb/raw/master/pmlb/all_summary_stats.tsv'
-summary_stats <- read.csv(links_to_stats, sep = '\t')
-colnames(summary_stats) <- tolower(gsub('X.', 'n_', colnames(summary_stats)))
-dataset_names <- summary_stats$dataset
-
-regression_dataset_names <- sort(subset(summary_stats, task == 'regression')$dataset)
-classification_dataset_names <- sort(subset(summary_stats, task == 'classification')$dataset)
-
-usethis::use_data(summary_stats, dataset_names,
-                  classification_dataset_names,
-                  regression_dataset_names,
-                  overwrite = TRUE)
diff --git a/data/classification_dataset_names.rda b/data/classification_dataset_names.rda
deleted file mode 100644
index 87951c8..0000000
Binary files a/data/classification_dataset_names.rda and /dev/null differ
diff --git a/data/dataset_names.rda b/data/dataset_names.rda
deleted file mode 100644
index 0f831e9..0000000
Binary files a/data/dataset_names.rda and /dev/null differ
diff --git a/data/regression_dataset_names.rda b/data/regression_dataset_names.rda
deleted file mode 100644
index ab1e042..0000000
Binary files a/data/regression_dataset_names.rda and /dev/null differ
diff --git a/data/summary_stats.rda b/data/summary_stats.rda
deleted file mode 100644
index b8050c6..0000000
Binary files a/data/summary_stats.rda and /dev/null differ
diff --git a/man/classification_dataset_names.Rd b/man/classification_dataset_names.Rd
deleted file mode 100644
index b682290..0000000
--- a/man/classification_dataset_names.Rd
+++ /dev/null
@@ -1,19 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data.R
-\docType{data}
-\name{classification_dataset_names}
-\alias{classification_dataset_names}
-\title{Names of available classification datasets}
-\format{
-An object of class \code{character} of length 162.
-}
-\source{
-\url{https://github.com/EpistasisLab/pmlb}
-}
-\usage{
-classification_dataset_names
-}
-\description{
-A list of the names of available classification datasets
-}
-\keyword{datasets}
diff --git a/man/classification_datasets.Rd b/man/classification_datasets.Rd
new file mode 100644
index 0000000..6625ca4
--- /dev/null
+++ b/man/classification_datasets.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/list_datasets.R
+\name{classification_datasets}
+\alias{classification_datasets}
+\title{Classification datasets}
+\usage{
+classification_datasets()
+}
+\value{
+A character vector of classification dataset names.
+}
+\description{
+Classification datasets
+}
+\examples{
+if (interactive()) {
+  sample(classification_datasets(), 10)
+}
+}
diff --git a/man/dataset_names.Rd b/man/dataset_names.Rd
index 1692325..1b1f85a 100644
--- a/man/dataset_names.Rd
+++ b/man/dataset_names.Rd
@@ -1,19 +1,19 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data.R
-\docType{data}
+% Please edit documentation in R/list_datasets.R
 \name{dataset_names}
 \alias{dataset_names}
-\title{Names of all available datasets}
-\format{
-An object of class \code{character} of length 284.
-}
-\source{
-\url{https://github.com/EpistasisLab/pmlb}
-}
+\title{All available datasets}
 \usage{
-dataset_names
+dataset_names()
+}
+\value{
+A character vector of all dataset names.
 }
 \description{
-A list of the names of available datasets
+All available datasets
+}
+\examples{
+if (interactive()) {
+  sample(dataset_names(), 10)
+}
 }
-\keyword{datasets}
diff --git a/man/fetch_data.Rd b/man/fetch_data.Rd
index 9f7da14..cbdc9f5 100644
--- a/man/fetch_data.Rd
+++ b/man/fetch_data.Rd
@@ -40,5 +40,5 @@ if (interactive()){
 
 }
 \seealso{
-\code{\link{summary_stats}}.
+\code{\link{pmlb_metadata}}.
 }
diff --git a/man/nearest_datasets-methods.Rd b/man/nearest_datasets-methods.Rd
index 143e36e..c9d7ea7 100644
--- a/man/nearest_datasets-methods.Rd
+++ b/man/nearest_datasets-methods.Rd
@@ -39,7 +39,7 @@ or data.frame of n_samples x n_features(or n_features+1 with a target column)}
 
 \item{dimensions}{Character vector specifying dataset characteristics to include in similarity calculation.
 Dimensions must correspond to numeric columns of
-[all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summary_stats.tsv).
+[all_summary_stats.tsv](https://github.com/EpistasisLab/pmlb/blob/master/pmlb/all_summary_stats.tsv).
 If 'all' (default), uses all numeric columns.}
 
 \item{target_name}{Character string specifying column of target/dependent variable.}
diff --git a/man/pmlb.Rd b/man/pmlb.Rd
index be252e6..61d7511 100644
--- a/man/pmlb.Rd
+++ b/man/pmlb.Rd
@@ -17,7 +17,7 @@ This R library includes summaries of the classification and regression data sets
 include any of the PMLB data sets.  The data sets can be downloaded using the \code{\link{fetch_data}} function which
 is similar to the corresponding PMLB python function.
 
-See \code{\link{fetch_data}}, \code{\link{summary_stats}} for usage examples and further information.
+See \code{\link{fetch_data}}, \code{\link{pmlb_metadata}} for usage examples and further information.
 
 If you use PMLB in a scientific publication, please consider citing the following paper:
 
diff --git a/man/pmlb_metadata.Rd b/man/pmlb_metadata.Rd
new file mode 100644
index 0000000..9b6fa96
--- /dev/null
+++ b/man/pmlb_metadata.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/list_datasets.R
+\name{pmlb_metadata}
+\alias{pmlb_metadata}
+\title{Get metadata for all datasets in PMLB.}
+\usage{
+pmlb_metadata()
+}
+\value{
+A list containing summary_stats, dataset_names, classification_datasets, and regression_datasets
+}
+\description{
+Metadata like summary statistics and names of available datasets
+on the PMLB repository.
+}
+\examples{
+if (interactive()) {
+  sample(pmlb_metadata()$dataset_names, 10)
+}
+}
diff --git a/man/regression_dataset_names.Rd b/man/regression_dataset_names.Rd
deleted file mode 100644
index c28c42b..0000000
--- a/man/regression_dataset_names.Rd
+++ /dev/null
@@ -1,19 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data.R
-\docType{data}
-\name{regression_dataset_names}
-\alias{regression_dataset_names}
-\title{Names of available regression datasets}
-\format{
-An object of class \code{character} of length 122.
-}
-\source{
-\url{https://github.com/EpistasisLab/pmlb}
-}
-\usage{
-regression_dataset_names
-}
-\description{
-A list of the names of available regression datasets
-}
-\keyword{datasets}
diff --git a/man/regression_datasets.Rd b/man/regression_datasets.Rd
new file mode 100644
index 0000000..c7c4abe
--- /dev/null
+++ b/man/regression_datasets.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/list_datasets.R
+\name{regression_datasets}
+\alias{regression_datasets}
+\title{Regression datasets}
+\usage{
+regression_datasets()
+}
+\value{
+A character vector of regression dataset names.
+}
+\description{
+Regression datasets
+}
+\examples{
+if (interactive()) {
+  sample(regression_datasets(), 10)
+}
+}
diff --git a/man/summary_stats.Rd b/man/summary_stats.Rd
index 6f8539f..638f319 100644
--- a/man/summary_stats.Rd
+++ b/man/summary_stats.Rd
@@ -1,31 +1,20 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data.R
-\docType{data}
+% Please edit documentation in R/list_datasets.R
 \name{summary_stats}
 \alias{summary_stats}
-\title{Summary statistics for the all datasets}
-\format{
-A data frame with 10 variables:
-\describe{
-  \item{dataset:}{Dataset name}
-  \item{n_instances:}{Number of data observations (equal to number of rows)}
-  \item{n_features:}{Total number of features (number of columns - 1)}
-  \item{n_binary_features:}{Number of binary features}
-  \item{n_categorical_features:}{Number of categorical features}
-  \item{n_continuous_features:}{Number of continuous features}
-  \item{n_classes:}{Number of classes in target variable}
-  \item{endpoint_type:}{Value type of endpoint/target (can be binary, categorical or continuous)}
-  \item{imbalance:}{Imbalance metric, where zero means that the dataset is perfectly balanced and the higher the value, the more imbalanced the dataset}
-  \item{task:}{Type of problem/task. Can be classification or regression.}
+\title{Summary statistics}
+\usage{
+summary_stats()
 }
+\value{
+A dataframe of summary statistics of all available datasets,
+including number of instances/rows, number of columns/features, task, etc.
 }
-\source{
-\url{https://github.com/EpistasisLab/pmlb}
+\description{
+Summary statistics
 }
-\usage{
-summary_stats
+\examples{
+if (interactive()) {
+  head(summary_stats())
 }
-\description{
-Summary statistics for the all datasets
 }
-\keyword{datasets}
diff --git a/tests/testthat.R b/tests/testthat.R
new file mode 100644
index 0000000..af3cfcb
--- /dev/null
+++ b/tests/testthat.R
@@ -0,0 +1,12 @@
+# This file is part of the standard setup for testthat.
+# It is recommended that you do not modify it.
+#
+# Where should you do additional test configuration?
+# Learn more about the roles of various files in:
+# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview
+# * https://testthat.r-lib.org/articles/special-files.html
+
+library(testthat)
+library(pmlbr)
+
+test_check("pmlbr")
diff --git a/tests/testthat/test-list_datasets.R b/tests/testthat/test-list_datasets.R
new file mode 100644
index 0000000..6c27db4
--- /dev/null
+++ b/tests/testthat/test-list_datasets.R
@@ -0,0 +1,12 @@
+test_that("pmlb_metadata works as expected", {
+  skip_on_cran()
+  data <- pmlb_metadata()
+  expect_true(is.data.frame(data$summary_stats))
+  expect_true(is.character(data$dataset_names))
+  expect_true(is.character(data$classification_datasets))
+  expect_true(is.character(data$regression_datasets))
+
+  expect_true("penguins" %in% dataset_names())
+  expect_true("penguins" %in% classification_datasets())
+  expect_true("1089_USCrime" %in% regression_datasets())
+})
diff --git a/tests/testthat/test-nearest_datasets.R b/tests/testthat/test-nearest_datasets.R
new file mode 100644
index 0000000..41a2bd5
--- /dev/null
+++ b/tests/testthat/test-nearest_datasets.R
@@ -0,0 +1,16 @@
+test_that("nearest dataset is itself", {
+  skip_on_cran()
+  expect_equal(
+    nearest_datasets("penguins")[[1]],
+    "penguins"
+  )
+  expect_equal(
+    nearest_datasets(fetch_data("lupus"))[[1]],
+    "lupus"
+  )
+
+  expect_equal(
+    nearest_datasets("1089_USCrime")[[1]],
+    "1089_USCrime"
+  )
+})