diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..1c10163 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,31 @@ +Package: pamm.xgb +Title: Advanced survival analysis using xgboost +Version: 0.1.0 +Date: 2020-01-07 +Authors@R: person("Andreas", "Bender", , "andreas.bender@stat.uni-muenchen.de", role = c("aut", "cre"), comment=c(ORCID = "0000-0001-5628-8611")) +Description: Estimate piece-wise exponential models using gradient based algorithms. +Depends: R (>= 3.6.0) +Imports: + caret, + checkmate, + rlang, + purrr, + dplyr, + tidyr, + pammtools, + xgboost, + pec, + prodlim, + Formula, + survival +Suggests: + knitr, + rmarkdown, + testthat +Remotes: + adibender/pammtools +License: MIT + file LICENSE +Encoding: UTF-8 +LazyData: true +RoxygenNote: 7.0.2 +VignetteBuilder: knitr diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0729b78 --- /dev/null +++ b/LICENSE @@ -0,0 +1,2 @@ +YEAR: 2019 +COPYRIGHT HOLDER: Andreas Bender diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..f642843 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +# MIT License + +Copyright (c) 2019 Andreas Bender + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..06d8ef1 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,57 @@ +# Generated by roxygen2: do not edit by hand + +S3method(as.data.frame,crps) +S3method(as_ped,pam_xgb) +S3method(predict,pam_xgb) +S3method(predictEventProb,pam_xgb) +S3method(predictSurvProb,pam_xgb) +export(add_cumu_hazard2) +export(add_hazard2) +export(add_surv_prob2) +export(as_xgb_data) +export(get_cv_indices) +export(get_ibs) +export(get_split_indices) +export(random_params) +export(rf.cv_pam) +export(rf.tune_pam) +export(sim_pexp_cr) +export(xgb.cv_pam) +export(xgb.train.ped) +export(xgb.tune_pam) +export(xgboost.ped) +import(checkmate) +import(dplyr) +importFrom(Formula,Formula) +importFrom(caret,createFolds) +importFrom(dplyr,bind_cols) +importFrom(pammtools,as_ped) +importFrom(pammtools,get_intervals) +importFrom(pammtools,is.ped) +importFrom(pammtools,split_data) +importFrom(pec,crps) +importFrom(pec,pec) +importFrom(pec,predictEventProb) +importFrom(pec,predictSurvProb) +importFrom(prodlim,Hist) +importFrom(purrr,flatten_chr) +importFrom(purrr,map) +importFrom(purrr,map_dfr) +importFrom(purrr,map_lgl) +importFrom(purrr,reduce) +importFrom(rlang,.data) +importFrom(rlang,.env) +importFrom(rlang,UQ) +importFrom(rlang,is_atomic) +importFrom(rlang,quo_name) +importFrom(rlang,sym) +importFrom(stats,model.matrix) +importFrom(stats,predict) +importFrom(stats,quantile) +importFrom(survival,Surv) +importFrom(tidyr,fill) +importFrom(tidyr,pivot_longer) +importFrom(xgboost,setinfo) +importFrom(xgboost,xgb.DMatrix) +importFrom(xgboost,xgb.train) +importFrom(xgboost,xgboost) diff --git a/R/add-functions.R b/R/add-functions.R new file mode 100644 index 0000000..031bbdf --- /dev/null +++ b/R/add-functions.R @@ -0,0 +1,321 @@ +#' Add predicted (cumulative) hazard to data set +#' +#' Add (cumulative) hazard based on the provided data set and model. +#' +#' +#' @rdname add_hazard2 +#' @inheritParams pammtools::add_hazard +#' @inheritParams xgboost::xgboost +#' @param ... Further arguments passed to \code{\link[xgboost]{predict.xgb.Booster}}. +#' @param overwrite Should hazard columns be overwritten if already present in +#' the data set? Defaults to \code{FALSE}. +#' @import checkmate dplyr +#' @importFrom stats predict +#' @export +add_hazard2 <- function( + newdata, + object, + reference = NULL, + overwrite = FALSE, + time_var = NULL, + ...) { + + if (!overwrite) { + if ("hazard" %in% names(newdata)) { + stop("Data set already contains 'hazard' column. + Set `overwrite=TRUE` to overwrite") + } + } else { + rm.vars <- intersect( + c("hazard", "se", "ci_lower", "ci_upper"), + names(newdata)) + newdata <- newdata %>% select(-one_of(rm.vars)) + } + + + get_hazard2(object, newdata, reference = reference, ...) + +} + + +get_hazard2 <- function(object, newdata, reference = NULL, ...) { + UseMethod("get_hazard2", object) +} + +get_hazard2.xgb.Booster <- function( + object, + newdata, + reference = NULL, + time_var = NULL, + ...) { + + assert_data_frame(newdata, all.missing = FALSE) + + if (is.null(time_var)) { + time_var <- "tend" + } else { + assert_string(time_var) + assert_choice(time_var, colnames(newdata)) + } + + features <- unique(object$orig_features) + X <- as_xgb_data(newdata, features = features, label = "ped_status", + set_margin = FALSE) + hazard_name <- paste0("hazard_", class(object)[1]) + newdata[[hazard_name]] <- xgboost:::predict.xgb.Booster(object, X) + + newdata + +} + +preproc_reference <- function(reference, cnames, n_rows) { + + # check that provided variables contained in newdata + names_ref <- names(reference) + if (!check_subset(names_ref, cnames)) { + stop(paste0("Columns in 'reference' but not in 'newdata':", + paste0(setdiff(names_ref, cnames), collapse = ","))) + } + # transform to list if inherits from data frame, so it can be processed + # in mutate via !!! + if (inherits(reference, "data.frame")) { + if (!(nrow(reference) == n_rows | nrow(reference) == 1)) { + stop("If reference is provided as data frame, number of rows must be + either 1 or the number of rows in newdata.") + } + reference <- as.list(reference) + } + + reference + +} + + + +#' @rdname add_hazard2 +#' @inheritParams add_hazard2 +#' @param interval_length The variable in newdata containing the interval lengths. +#' Can be either bare unquoted variable name or character. Defaults to \code{"intlen"}. +#' @importFrom dplyr bind_cols +#' @export +add_cumu_hazard2 <- function( + newdata, + object, + overwrite = FALSE, + time_var = NULL, + interval_length = "intlen", + ...) { + + interval_length <- quo_name(enquo(interval_length)) + + if (!overwrite) { + if ("cumu_hazard" %in% names(newdata)) { + stop( + "Data set already contains 'hazard' column. + Set `overwrite=TRUE` to overwrite") + } + } else { + rm.vars <- intersect(c("cumu_hazard", "cumu_lower", "cumu_upper"), + names(newdata)) + newdata <- newdata %>% select(-one_of(rm.vars)) + } + + get_cumu_hazard2(object, newdata, time_var = time_var, + interval_length = interval_length, ...) + +} + +#' Calculate cumulative hazard +#' +#' @inheritParams add_cumu_hazard2 +#' @import checkmate dplyr +#' @importFrom rlang UQ sym quo_name .data .env +#' @importFrom purrr map_lgl +#' @keywords internal +get_cumu_hazard2 <- function( + object, + newdata, + time_var = NULL, + interval_length = "intlen", + ...) { + + assert_character(interval_length) + assert_subset(interval_length, colnames(newdata)) + assert_data_frame(newdata, all.missing = FALSE) + + interval_length <- sym(interval_length) + hazard_name <- paste0("hazard_", class(object)[1]) + cumu_name <- paste0("cumu_hazard_", class(object)[1]) + + mutate_args <- list(cumu_hazard = quo(cumsum(.data[[hazard_name]] * + (!!interval_length)))) + haz_vars_in_data <- map(c(hazard_name, "se"), + ~ grep(.x, colnames(newdata), value = TRUE, fixed = TRUE)) %>% flatten_chr() + vars_exclude <- c(hazard_name) + + newdata <- get_hazard2(object, newdata, time_var = time_var, ...) + + newdata <- newdata %>% + mutate(!!!mutate_args) + newdata <- newdata %>% rename({{cumu_name}} := "cumu_hazard") + + vars_exclude <- setdiff(vars_exclude, haz_vars_in_data) + if (length(vars_exclude) != 0 ) { + newdata <- newdata %>% select(-one_of(vars_exclude)) + } + + newdata + +} + + +#' Add survival probability estimates +#' +#' Given suitable data (i.e. data with all columns used for estimation of the model), +#' this functions adds a column \code{surv_prob} containing survival probabilities +#' for the specified covariate and follow-up information. +#' +#' @inherit add_cumu_hazard2 +#' @export +add_surv_prob2 <- function( + newdata, + object, + overwrite = FALSE, + time_var = NULL, + interval_length = "intlen", + ...) { + + interval_length <- quo_name(enquo(interval_length)) + + if (!overwrite) { + if ("surv_prob" %in% names(newdata)) { + stop("Data set already contains 'surv_prob' column. + Set `overwrite=TRUE` to overwrite") + } + } else { + rm.vars <- intersect(c("surv_prob"), names(newdata)) + newdata <- newdata %>% select(-one_of(rm.vars)) + } + + get_surv_prob2(object, newdata, + time_var = time_var, interval_length = interval_length, ...) + +} + + +#' Calculate survival probabilities +#' +#' @inheritParams add_surv_prob2 +#' @importFrom purrr flatten_chr +#' @keywords internal +get_surv_prob2 <- function( + object, + newdata, + time_var = NULL, + interval_length = "intlen", + ...) { + + assert_character(interval_length) + assert_subset(interval_length, colnames(newdata)) + assert_data_frame(newdata, all.missing = FALSE) + + interval_length <- sym(interval_length) + hazard_name <- paste0("hazard_", class(object)[1]) + surv_name <- paste0("surv_prob_", class(object)[1]) + + mutate_args <- list(surv_prob = quo(exp(-cumsum(.data[[hazard_name]] * + (!!interval_length))))) + haz_vars_in_data <- map(c(hazard_name), + ~grep(.x, colnames(newdata), value = TRUE, fixed = TRUE)) %>% flatten_chr() + vars_exclude <- c(hazard_name) + + newdata <- get_hazard2( + object = object, + newdata, + time_var = time_var) + + newdata <- newdata %>% + mutate(!!!mutate_args) + newdata <- newdata %>% rename({{surv_name}} := "surv_prob") + + vars_exclude <- setdiff(vars_exclude, haz_vars_in_data) + if (length(vars_exclude) != 0 ) { + newdata <- newdata %>% select(-one_of(vars_exclude)) + } + + newdata + +} + + +# add_cif <- function() { +# newdata, +# object, +# overwrite = FALSE, +# time_var = NULL, +# interval_length = "intlen", +# ...) { + +# interval_length <- quo_name(enquo(interval_length)) + +# if (!overwrite) { +# if ("surv_prob" %in% names(newdata)) { +# stop("Data set already contains 'surv_prob' column. +# Set `overwrite=TRUE` to overwrite") +# } +# } else { +# rm.vars <- intersect(c("surv_prob"), names(newdata)) +# newdata <- newdata %>% select(-one_of(rm.vars)) +# } + +# get_surv_prob2(object, newdata, +# time_var = time_var, interval_length = interval_length, ...) + +# } + +# #' Calculate all-cause survival probabilities +# #' +# #' @inheritParams add_surv_prob2 +# #' @importFrom purrr flatten_chr +# #' @keywords internal +# get_all_cause_surv_prob <- function( +# object, +# newdata, +# time_var = NULL, +# interval_length = "intlen", +# status_var = "type", +# ...) { + +# assert_character(interval_length) +# assert_subset(interval_length, colnames(newdata)) +# assert_data_frame(newdata, all.missing = FALSE) + + +# interval_length <- sym(interval_length) +# hazard_name <- paste0("hazard_", class(object)[1]) +# hazard_names <- paste0(hazard_name, seq_len(unique(newdata[[status_var]]))) +# surv_name <- paste0("surv_prob_allcause", class(object)[1]) + +# mutate_args <- list(surv_prob = quo(exp(-cumsum(.data[[hazard_name]] * +# (!!interval_length))))) +# haz_vars_in_data <- map(c(hazard_name), +# ~grep(.x, colnames(newdata), value = TRUE, fixed = TRUE)) %>% flatten_chr() +# vars_exclude <- c(hazard_name) + +# newdata1 <- get_hazard2( +# object = object, +# dplyr::mutate(newdata$status = status_vals[1], +# time_var = time_var) + +# newdata <- newdata %>% +# mutate(!!!mutate_args) +# newdata <- newdata %>% rename({{surv_name}} := "surv_prob") + +# vars_exclude <- setdiff(vars_exclude, haz_vars_in_data) +# if (length(vars_exclude) != 0 ) { +# newdata <- newdata %>% select(-one_of(vars_exclude)) +# } + +# newdata + +# } diff --git a/R/cv-splits.R b/R/cv-splits.R new file mode 100644 index 0000000..e3a59e1 --- /dev/null +++ b/R/cv-splits.R @@ -0,0 +1,55 @@ +#' Create a lst of train/test indices +#' +#' @param n number of rows in data set. +#' @param nfold number of folds for cross-validation. +#' @importFrom purrr map +#' @importFrom caret createFolds +#' +#' @export +#' @keywords internal +get_cv_indices <- function(y, nfold = 4) { + + ## set up CV + ind_test <- createFolds(y, k = nfold) + ind_ret <- map( + ind_test, + ~ list( + ind_train = setdiff(seq_along(y), .x), + ind_test = .x + ) + ) + +} + +#' Split data into training and eval data for watchlist +#' +#' @export +#' @keywords internal +get_split_indices <- function(n, split_frac = .7) { + + n_train <- round(n * split_frac, 0) + train_idx <- sample(seq_len(n), n_train) + eval_idx <- setdiff(seq_len(n), train_idx) + + list(ind_train = train_idx, ind_eval = eval_idx) + +} + +#' Draw random parameters from prespecified parameter space +#' +#' Checks if integer or double. Samples integers or draws from uniform distribution +#' respectively. +#' @export +random_params <- function(param_space) { + + purrr::map( + param_space, + ~ { + if(is.integer(.x[1])) { + sample(seq(.x[1], .x[2], by = 1L), 1) + } else { + runif(1, .x[1], .x[2]) + } + } + ) +} diff --git a/R/data-trafo-helpers.R b/R/data-trafo-helpers.R new file mode 100644 index 0000000..ae76940 --- /dev/null +++ b/R/data-trafo-helpers.R @@ -0,0 +1,97 @@ +#' Transform data suitable for xgboost applications +#' @export +as_xgb_data <- function(x, ...) { + + UseMethod("as_xgb_data", x) + +} + + +#' @rdname as_xgb_data +#' @importFrom stats model.matrix +#' @inherit as_xgb_data +#TODO: remove hard coded vars (e.g. offset) +as_xgb_data.data.frame <- function(x, features, label, set_margin = TRUE, ...) { + + mm <- model.matrix(~ ., x[, features, drop = FALSE])[, -1] + xgb_data <- xgb.DMatrix( + data = mm, + label = as.numeric(x[[label]]) + ) + if(set_margin) { + setinfo(xgb_data, "base_margin", x[["offset"]]) + } + + xgb_data + +} + +#' @rdname as_xgb_data +#' @inherit as_xgb_data +# TODO: remove hard coded vars (should be stored in ped object) +as_xgb_data.ped <- function(x, ...) { + + omit_vars <- attributes(x)[c("id_var", "intvars")] %>% unlist() + keep_vars <- unique(c("tend", setdiff(colnames(x), omit_vars))) + + as_xgb_data(as.data.frame(x), features = keep_vars, label = "ped_status") + +} + +#' Transform new data to xgb-PED format using same arguments as previous transformation +#' +#' @keywords internal +as_xgb_newdata <- function(x, newdata, ...) { + UseMethod("as_xgb_newdata", x) +} + +#' @inherit as_xgb_newdata +#' @keywords internal +as_xgb_newdata.ped <- function(x, newdata, ...) { + + ped <- as_ped(x, newdata) + vars <- setdiff(attr(x, "names"), attr(x, "intvars")) + + as_xgb_data(ped, features = vars, label = "ped_status") + + +} +#' @inherit as_xgb_newdata +#' @keywords internal +as_xgb_newdata.pam_xgb <- function(x, newdata, ...) { + + ped_newdata <- as_ped(x, newdata) + + as_xgb_data(ped_newdata) + +} + +#' Transform newdata to PED format based on fitted xgb model +#' +#' @rdname as_ped +#' @importFrom pammtools as_ped split_data is.ped +#' @export +as_ped.pam_xgb <- function(data, newdata, ...) { + + if (is.ped(newdata)) { + stop("newdata already in ped format.") + } + + trafo_args <- data[["trafo_args"]] + trafo_args$data <- newdata + do.call(split_data, trafo_args) + +} + +#' @rdname as_ped +as_ped.ped <- function(data, newdata, ...) { + + if (pammtools::is.ped(newdata)) { + stop("newdata already in ped format.") + } + + trafo_args <- attr(data, "trafo_args") + trafo_args[["data"]] <- newdata + do.call(pammtools::as_ped, trafo_args) + +} diff --git a/R/pec-helpers.R b/R/pec-helpers.R new file mode 100644 index 0000000..ad2ecc7 --- /dev/null +++ b/R/pec-helpers.R @@ -0,0 +1,147 @@ +#' Extract IBS in tidy format +#' +#' @inheritParams pec::pec +#' @inheritParams pec::crps +#' @importFrom stats quantile +#' @param keep_reference Logical. Should Brier Score of Reference model be kept +#' in results table. +#' @export +get_ibs <- function( + object, + data, + pec_params = list( + formula = Surv(time, status) ~ 1, + exact = FALSE, + start = .01), + keep_reference = TRUE, + q_last = .8, + q_eval = c(.25, .5, .75), + ...) { + + if (class(data) == "list") { + status_values <- sort(unique(data[[1]]$status)) + } else { + status_values <- sort(unique(data$status)) + } + + cause_values <- status_values[status_values != 0] + + res <- map_dfr( + cause_values, + ~ { + times <- times_list(data, q_last = q_last, q_eval = q_eval, + status_value = .x) + time_seq <- seq(.01, times$t_last, length.out = 500L) + if (class(data) == "list") { + if(class(object)[1] == "list") { + pred <- predictSurvProb(object[[1]], data, times = time_seq) + } else { + pred <- predictSurvProb(object, data, times = time_seq) + } + } + pec_params$object <- if (class(data) == "list") {list("pam_xgb" = pred)} else {object} + pec_params$data <- if (class(data) == "list") { data[[1]]} else{ data } + pec_params$times <- time_seq + pec_params$cause <- .x + + if(length(cause_values) > 1) { + pec_params$formula <- update(pec_params$formula, Hist(time, status)~.) + } + + pec <- do.call(pec::pec, pec_params) + + ibs <- pec::crps(pec, times = times$eval_times) %>% + as.data.frame() %>% + group_by(method) %>% + mutate( + time = times$eval_times, + quantile = q_eval) %>% + ungroup() %>% + mutate(cause = .x) + + ibs + + } + ) + + if (!keep_reference) { + res <- res %>% + dplyr::filter(method != "Reference") + } + + res + +} + + +#' Look up Quantiles of event times in data +#' +#' @keywords internal +times_list <- function(data, q_last = .8, q_eval = c(.25, .5, .75), + time_var = "time", status_var = "status", status_value = 1) { + + if (class(data) == "list") { + data <- data[[1]] + } + + t_last <- quantile(data[[time_var]][data[[status_var]] == status_value], q_last) + eval_times <- quantile(data[[time_var]][data[[status_var]] == status_value], q_eval) + + list(t_last = t_last, eval_times = eval_times) + +} + + +#' C-Index from pec package +#' +get_cindex <- function( + object, + data, + cind_params = list( + formula = Surv(time, status) ~ 1, + exact = FALSE, + start = .01), + eval_times = NULL, + q_last = .8, + q_eval = c(.25, .5, .75)) { + + if(is.null(eval_times)) { + times <- times_list(data, q_last = q_last, q_eval = q_eval) + eval_times <- times$eval_times + } + + + cind_params$object <- object + cind_params$data <- data + cind_params$eval.times <- eval_times + cind <- do.call(pec::cindex, cind_params) + + cind$AppCindex %>% as.data.frame() %>% + mutate(eval_time = eval_times) %>% + tidyr::gather("method", "cindex", -eval_time) + +} + + +#' Transform crps object to data.frame +#' +#' A\code{as.data.frame} S3 method for objects of class \code{\link[pec]{crps}}. +#' +#' @inheritParams base::as.data.frame +#' @param x An object of class \code{crps}. See \code{\link[pec]{crps}}. +#' @importFrom tidyr pivot_longer +#' +#' @export +as.data.frame.crps <- function(x, row.names = NULL, optional = FALSE, ...) { + + m <- matrix(x, nrow = dim(x)[1], ncol = dim(x)[2]) + colnames(m) <- attr(x, "dimnames")[[2]] + + m <- as.data.frame(m) + m$method <- attr(x, "dimnames")[[1]] + + m <- m %>% + pivot_longer(cols = -.data$method, values_to = "IBS") %>% + dplyr::rename(time = .data$name) + +} diff --git a/R/predict.R b/R/predict.R new file mode 100644 index 0000000..8155249 --- /dev/null +++ b/R/predict.R @@ -0,0 +1,336 @@ +#' Predict hazard, cumulative hazard or survival probability +#' +#' @param object An object of class \code{pam_xgb} +#' @param newdata A data set containing the same covariates as used for model +#' fitting. If of class \code{data.frame}, the function will try to transform +#' to the \code{xgb.DMatrix} format. +#' @param times A vector of times for which predictions should be generated +#' @param type The type of prediction desired. Either hazard (\code{type = "hazard"}), +#' cumulative hazard (\code{type = "cumu_hazard"}) or survival probability +#' (\code{type = "surv_prob"}). +#' @importFrom stats predict +#' @importFrom purrr map_dfr +#' @return A matrix of predictions containing one row per +#' observation (row in newdata) and 1 column per specified time in the +#' \code{times} argument. +#' @export +predict.pam_xgb <- function( + object, + newdata, + type = c("hazard", "cumu_hazard", "surv_prob"), + ...) { + + ## TODO: this function actually doesn't work if data is xgb.DMatrix + type <- match.arg(type) + brks <- attr(object, "attr_ped")$breaks + # attr(object, "attr_ped") <- c(attr(object, "attr_ped"), status_var = "status") + + ## TODO: catch case where newdata either xgb.DMatrix or data.frame in ped format + ped_newdata <- as_ped(object, newdata) + vars <- setdiff( + attr(ped_newdata, "names"), + attr(ped_newdata, "intvars")) + + xgb_newdata <- as_xgb_data(ped_newdata) + + ped_newdata[["pred"]] <- xgboost:::predict.xgb.Booster(object, xgb_newdata) + if (type == "cumu_hazard") { + ped_newdata <- ped_newdata %>% + group_by(.data$id) %>% + mutate(pred = cumsum(.data$pred * exp(.data$offset)))#TODO: is it correct to use offset here? + } + if (type == "surv_prob") { + ped_newdata <- ped_newdata %>% + group_by(.data[["id"]]) %>% + mutate(pred = exp(-cumsum(.data$pred * exp(.data$offset)))) + } + + ped_newdata %>% + group_by(.data[["id"]]) %>% + filter(row_number() == n()) %>% + pull(.data[["pred"]]) # TODO: is the hazard/surv prob in the last available interval a useful return? + +} + +# check for time-dependent covariates. +#TODO seems like this should just be a flag in attr_ped? +has_tdc <- function(model) { + any(c("ccr", "func") %in% names(attributes(model)[["attr_ped"]])) +} + +get_new_ped <- function(object, newdata, times, attr_ped) { + ## TODO: create ped_info without creating a ped_newdata object first + # extract vars used in model fit + covars <- setdiff(attr_ped[["names"]], attr_ped[["intvars"]]) + if ("tend" %in% object$feature_names) { + vars <- c("tend", covars) + } + id_var <- attr_ped[["id_var"]] + brks <- attr_ped[["breaks"]] + ped_times <- sort(unique(union(c(0, brks), times))) + # extract relevant intervals only, keeps data small + ped_times <- ped_times[ped_times <= max(times)] + # obtain interval information + ped_info <- get_intervals(brks, ped_times[-1]) + # add adjusted offset such that cumulative hazard and survival probability + # can be calculated correctly + ped_info[["offset"]] <- c(ped_info[["times"]][1], diff(ped_info[["times"]])) + + # create data set with interval/time + covariate info + newdata[[id_var]] <- seq_len(nrow(newdata)) #TODO: potentially overwriting it here seems dangerous? + new_ped <- pammtools::combine_df(ped_info, newdata[, c(id_var, covars)]) + new_ped$ped_status <- 1 + new_ped +} + +get_all_intervals <- function( + x, + times, + left.open = TRUE, + rightmost.closed = TRUE, + ...) { + + # check inputs + assert_numeric(times, lower = 0, finite = TRUE, all.missing = FALSE) + + int_df <- pammtools::int_info(x) + int <- findInterval( + x = times, + vec = sort(union(int_df$tstart, int_df$tend)), + left.open = left.open, + rightmost.closed = rightmost.closed) + + #int_df %>% + # slice(int) %>% + # Q: slice(int) will drop duplicated entries -- this is not what we want here, I think? + # should this be changed in pammtools::get_intervals.default as well? + int_df[int,] %>% + mutate(times = times) %>% + # arrange(times) %>% + select(times, everything()) + +} + +#' @importFrom tidyr fill +get_new_ped_tdc <- function(object, newdata, times, attr_ped) { + ## TODO: create ped_info without creating a ped_newdata object first + # extract vars used in model fit + covars <- setdiff(attr_ped[["names"]], attr_ped[["intvars"]]) + id_var <- attr_ped[["id_var"]] + brks <- attr_ped[["breaks"]] + # stopifnot(all(times <= max(brks))) + + #get time info for predictive TDCs + ccr_tz_vars <- purrr::map_chr(attr_ped[["ccr"]][["ccr_list"]], + ~.x[["tz_var"]]) %>% unique() + # avoid assumption that newdata[[2]] is a single data.frame with TDCs on + # the same time grid. could also be a list of data.frames (?) + if (!is.data.frame(newdata[[2]])) { + # TODO: rename all ccr_tz_vars to "times", + # do a full join over all TDCs, + # then fill up NAs by LCVF + stop("multiple time scales for TDCs not implemented yet.") + } + #drop ids for which no time constant info is available: + newdata[[2]] <- filter(newdata[[2]], + !!sym(id_var) %in% newdata[[1]][[id_var]]) + + tdc_times <- select(newdata[[2]], any_of(ccr_tz_vars)) %>% + pull(1) + + ped_times <- sort(unique(c(0, brks, tdc_times, times))) + # extract relevant intervals only, keeps data small + ped_times <- ped_times[ped_times <= max(times)] + # obtain interval information + ped_info <- get_all_intervals(brks, ped_times[-1]) + # add adjusted offset such that cumulative hazard and survival probability + # can be calculated correctly + ped_info[["offset"]] <- c(ped_info[["times"]][1], diff(ped_info[["times"]])) + + # create data set with interval/time + time constant covariate info + ccr_covars <- map(attr_ped[["ccr"]][["ccr_list"]], ~.x[["col_vars"]]) %>% + unlist() %>% unique() + const_covars <- setdiff(covars, ccr_covars) + new_ped_const <- pammtools::combine_df(ped_info, + newdata[[1]][, c(id_var, const_covars)]) + new_ped_const$ped_status <- 1 + + # create data set with time dependent covariates (last value carried forward) + suppressMessages({ + new_ped_tdc <- rename(newdata[[2]], times = sym(ccr_tz_vars)) %>% + full_join(select(new_ped_const, all_of(c(id_var, "times")))) %>% + filter(.data$times <= max(ped_times)) %>% + arrange(!!sym(id_var), !!sym("times")) %>% + tidyr::fill(all_of(ccr_covars)) %>% + filter(.data$times != min(ped_times)) #drop rows for "time origin" + + inner_join(new_ped_const, new_ped_tdc) + + }) + +} +#' S3 method for compatibility with package pec +#' +#' @importFrom pec predictSurvProb +#' @importFrom purrr map +#' @importFrom pammtools get_intervals +#' @export +predictSurvProb.pam_xgb <- function( + object, + newdata, + times) { + + attr_ped <- attributes(object)[["attr_ped"]] + covars <- setdiff(attr_ped[["names"]], attr_ped[["intvars"]]) + if ("tend" %in% object$feature_names) { + vars <- c("tend", covars) + } + id_var <- attr_ped[["id_var"]] + + if (has_tdc(object)) { + new_ped <- get_new_ped_tdc(object, newdata, times, attr_ped) + } else { + new_ped <- get_new_ped(object, newdata, times, attr_ped) + } + + new_ped[["pred"]] <- xgboost:::predict.xgb.Booster( + object, + as_xgb_data(new_ped, vars, label = "ped_status", set_margin = FALSE))## Note label not needed, set any + new_ped <- new_ped %>% + arrange(.data$id, .data$times) %>% + group_by(.data$id) %>% + mutate(pred = exp(-cumsum(.data$pred * .data$offset))) %>% + ungroup() %>% + filter(.data[["times"]] %in% .env[["times"]]) + + id <- unique(new_ped[[id_var]]) + pred_list <- map( + id, + ~ new_ped[new_ped[[id_var]] == .x, "pred"] %>% pull("pred")) + + do.call(rbind, pred_list) + +} +#' S3 method for compatibility with package pec +#' +#' This function is needed to use \code{pec::pec} in the competing risks setting. +#' +#' @importFrom pec predictEventProb +#' @importFrom purrr map +#' @importFrom pammtools get_intervals +#' @export +predictEventProb.pam_xgb <- function( + object, + newdata, + times, + cause, + ... +) { + + attr_ped <- attributes(object)[["attr_ped"]] + id_var <- attr_ped[["id_var"]] + brks <- attr_ped[["breaks"]] + ped_times <- sort(unique(union(c(0, brks), times))) + ped_times <- ped_times[ped_times <= max(times)] + ped_info <- get_intervals(brks, ped_times[-1]) + ped_info[["offset"]] <- c(ped_info[["times"]][1], diff(ped_info[["times"]])) + covars <- setdiff(attr_ped[["names"]], attr_ped[["intvars"]]) + if ("tend" %in% object$feature_names) { + vars <- c("tend", covars) + } + newdata[["type"]] <- 1 + newdata[[id_var]] <- seq_len(nrow(newdata)) + new_ped <- pammtools::combine_df(ped_info, newdata[, c(id_var, covars)]) + new_ped$ped_status <- 1 # irrelevant + new_ped$type <- 1 + new_ped[["csh1"]] <- xgboost:::predict.xgb.Booster(object, + as_xgb_data(new_ped, vars, label = "ped_status", set_margin = FALSE)) + new_ped$type <- 2 # predict hazard for second cause + new_ped[["csh2"]] <- xgboost:::predict.xgb.Booster(object, + as_xgb_data(new_ped, vars, label = "ped_status", set_margin = FALSE)) + new_ped <- new_ped %>% + arrange(.data$id, .data$times) %>% + group_by(.data$id) %>% + mutate(sp_all_cause = exp(-( + cumsum(.data$csh1 * .data$offset) + + cumsum(.data$csh2 * .data$offset)))) %>% + mutate(cif1 = cumsum(.data$csh1 * lag(.data$sp_all_cause, default = 1) * .data$offset)) %>% + mutate(cif2 = cumsum(.data$csh2 * lag(.data$sp_all_cause, default = 1) * .data$offset)) %>% + ungroup() %>% + filter(.data[["times"]] %in% .env[["times"]]) + id <- unique(new_ped[[id_var]]) + pred_list <- map(id, ~new_ped[new_ped[[id_var]] == .x, + paste0("cif", cause)] %>% + pull(paste0("cif", cause))) + do.call(rbind, pred_list) + +} +# #' S3 method for pamm objects for compatibility with package pec +# #' +# #' @inheritParams pec::predictSurvProb +# #' @importFrom pec predictSurvProb +# #' @importFrom purrr map +# #' +# #' @export +# predictEventProb.pamm <- function( +# object, +# newdata, +# times, +# cause, +# ...) { + +# if (!is.ped(newdata)) { + +# trafo_args <- object[["attr_ped"]] +# id_var <- trafo_args[["id_var"]] +# brks <- trafo_args[["breaks"]] +# if ( max(times) > max(brks) ) { +# stop("Can not predict beyond the last time point used during model estimation. +# Check the 'times' argument.") +# } +# ped_times <- sort(unique(union(c(0, brks), times))) +# # extract relevant intervals only, keeps data small +# ped_times <- ped_times[ped_times <= max(times)] +# # obtain interval information +# ped_info <- get_intervals(brks, ped_times[-1]) +# # add adjusted offset such that cumulative hazard and survival probability +# # can be calculated correctly +# ped_info[["intlen"]] <- c(ped_info[["times"]][1], diff(ped_info[["times"]])) +# # create data set with interval/time + covariate info +# newdata[[id_var]] <- seq_len(nrow(newdata)) +# newdata <- combine_df(ped_info, newdata) + +# } + +# env_times <- times +# newdata$type2 <- factor(1, levels=c("1", "2")) +# newdata[["csh1"]] <- predict( +# pammtools:::unpam(object), +# newdata = newdata, +# type = "response") +# newdata$type2 <- factor(2, levels= c("1", "2")) +# newdata[["csh2"]] <- predict( +# pammtools:::unpam(object), +# newdata = newdata, +# type = "response") +# newdata$type2 <- factor(cause, levels = c("1", "2")) +# newdata <- newdata %>% +# arrange(.data$id, .data$times) %>% +# group_by(.data$id) %>% +# mutate(sp_all_cause = exp(-( +# cumsum(.data$csh1 * .data$intlen) + +# cumsum(.data$csh2 * .data$intlen)))) %>% +# mutate(cif1 = cumsum(csh1 * lag(sp_all_cause, default = 1) * .data$intlen)) %>% +# mutate(cif2 = cumsum(csh2 * lag(sp_all_cause, default = 1) * .data$intlen)) %>% +# ungroup() %>% +# filter(.data[["times"]] %in% .env[["times"]]) + +# id <- unique(newdata[[id_var]]) +# pred_list <- map( +# id, +# ~ newdata[newdata[[id_var]] == .x, +# paste0("cif", cause)] %>% pull(paste0("cif", cause))) + +# do.call(rbind, pred_list) + +# } diff --git a/R/sim-cr.R b/R/sim-cr.R new file mode 100644 index 0000000..c6dbc0c --- /dev/null +++ b/R/sim-cr.R @@ -0,0 +1,104 @@ +#' Simulate competing risks time-to-event data via piece-wise exponential distribution +#' +#' Piece-wise exponential implementation of simulation algorithm described in +#' Beyersmann et al. 2009 (). +#' +#' @inheritParams pammtools::sim_pexp +#' @importFrom Formula Formula +#' @importFrom rlang is_atomic +#' @importFrom purrr reduce +#' @examples +#' library(pammtools) +#' library(dplyr) +#' library(survival) +#' # Example from Competing Risks book for comparison +#' simul.dat.cp <- function(n, h01, h02, cens.param) { +#' times <- rexp(n, h01 + h02) +#' ev <- rbinom(n, size = 1, prob = h01 / (h01 + h02)) +#' ev <- ifelse(ev == 0, 2, 1) +#' cens.time <- runif(n, cens.param[1], cens.param[2]) +#' obs.time <- pmin(times, cens.time) +#' obs.cause <- as.numeric(times == obs.time) * ev +#' data.frame(obs.time, obs.cause) +#' } +#' set.seed(923) +#' n <- 1000 +#' dat.exo7 <- simul.dat.cp(n, 0.5, 0.9, c(0.5, 3)) +#' # We compute the Nelson-Aalen estimates using survfit() +#' # That’s just to get the number of events and the risk set +#' temp01 <- survfit(Surv(obs.time, obs.cause == 1) ~ 1, dat.exo7) +#' temp02 <- survfit(Surv(obs.time, obs.cause == 2) ~ 1, dat.exo7) +#' na01 <- cumsum(temp01$n.event / temp01$n.risk) +#' na02 <- cumsum(temp02$n.event / temp02$n.risk) +#' +#' plot(temp01$time, na02, type="s", ylab="Cumulative transition hazard", xlab="time") +#' lines(temp01$time, na01, type="s", col=2) +#' +#' # create data set with variables which will affect the hazard rate +#' # (not used here, but usually more complex examples of interest) +#' df <- cbind.data.frame(x1 = runif (n, -3, 3), x2 = runif (n, 0, 6)) %>% +#' as_tibble() +#' set.seed(24032018) +#' df <- cbind.data.frame( +#' x1 = runif (n, -3, 3), +#' x2 = runif (n, 0, 6)) +#' # two component formula specifying cause specific hazards +#' form <- ~ log(0.5)| log(0.9) +#' sim_df <- sim_pexp_cr(form, df, seq(0, 3, by =.25)) %>% +#' mutate( +#' cens_time = runif(n(), 0.5, 3), +#' status = if_else(cens_time < time, 0, 1), +#' time = pmin(time, cens_time), +#' type = status * type) +#' temp01_2 <- survfit(Surv(time,type == 1) ~ 1, sim_df) +#' temp02_2 <- survfit(Surv(time, type == 2) ~ 1, sim_df) +#' na01_2 <- cumsum(temp01_2$n.event / temp01_2$n.risk) +#' na02_2 <- cumsum(temp02_2$n.event / temp02_2$n.risk) +#' +#' lines(temp01_2$time, na02_2, type="s", col = 3) +#' lines(temp01_2$time, na01_2, type="s", col = 4) +#' @export +sim_pexp_cr <- function(formula, data, cut) { + + Form <- Formula(formula) + F_rhs <- attr(Form, "rhs") + l_rhs <- length(F_rhs) + seq_rhs <- seq_len(l_rhs) + + data <- data %>% + mutate( + id = row_number(), + time = max(cut), + status = 1) + + # construct eta for time-constant part + ped <- split_data( + formula = Surv(time, status)~., + data = select_if (data, is_atomic), + cut = cut, + id = "id") %>% + rename("t" = "tstart") + + # calculate cause specific hazards + for(i in seq_rhs) { + ped[[paste0("hazard", i)]] <- exp(eval(F_rhs[[i]], ped)) + } + ped[["rate"]] <- reduce(ped[paste0("hazard", seq_rhs)], `+`) + + # simulate survival times + sim_df <- ped %>% + group_by(id) %>% + mutate( + time = pammtools:::rpexp(rate = .data$rate, t = .data$t), + status = 1L * (.data$time <= max(cut)), + time = pmin(.data$time, max(cut))) %>% + filter(.data$t < .data$time & .data$time <= .data$tend) + sim_df$type <- apply(sim_df[paste0("hazard", seq_rhs)], 1, + function(probs) + sample(seq_rhs, 1, prob = probs)) + + sim_df %>% + mutate(type = ifelse(.data$status == 1, .data$type, 0)) %>% + select(-one_of(c("t", "tend", "interval", "offset", "ped_status", "rate"))) + +} diff --git a/R/xgboost-fit.R b/R/xgboost-fit.R new file mode 100644 index 0000000..cc9871d --- /dev/null +++ b/R/xgboost-fit.R @@ -0,0 +1,384 @@ +#' Fit xgboost model to piece-wise exponential data +#' +#' Transforms a data set of class \code{ped} to a format suitable for +#' \code{xgboost}, then runs the xgboost model. All parameters available in +#' \code{\link[xgboost]{xgboost}} can be passed on via ellipsis. +#' The \code{base_score} is set to \code{1}. +#' @importFrom xgboost xgboost xgb.DMatrix setinfo +#' @param data An objet of class \code{ped}. +#' @param ... Further arguments passed to \code{\link[xgboost]{xgboost}}. + +#' @export +xgboost.ped <- function(data = NULL, ...) { + + attr_ped <- attributes(data) + trafo_args <- attr_ped[["trafo_args"]] + attr_ped[["trafo_args"]] <- NULL + + xgb_data <- as_xgb_data(data) + + pam_xgb <- xgboost( + xgb_data, + ..., + base_score = 1, + objective = "count:poisson") + omit_vars <- attributes(data)[c("id_var", "intvars")] %>% unlist() + keep_vars <- c("tend", setdiff(colnames(data), omit_vars)) + attr(pam_xgb, "attr_ped") <- attr_ped + pam_xgb[["orig_features"]] <- keep_vars + pam_xgb[["trafo_args"]] <- trafo_args + class(pam_xgb) <- c("pam_xgb", class(pam_xgb)) + + pam_xgb + +} + +#' PAMM wrapper for xgb.train +#' +#' @inheritParams xgboost::xgb.train +#' @importFrom xgboost xgb.train +#' @importFrom pammtools as_ped +#' @export +xgb.train.ped <- function( + params = list(), + data, + nrounds, + watchlist = list(), + obj = NULL, + feval = NULL, + verbose = 1, + print_every_n = 1L, + early_stopping_rounds = NULL, + maximize = NULL, + save_period = NULL, + save_name = "xgboost.model", + xgb_model = NULL, + callbacks = list(), + nthread = 1L, + base_score = 1, + ... +) { + + attr_ped <- attributes(data) + trafo_args <- attr_ped[["trafo_args"]] + attr_ped[["trafo_args"]] <- attr_ped[["row_names"]] <- + attr_ped[["class"]] <- NULL + + # xgb_data <- as_xgb_data() + + + pam_xgb <- xgb.train( + params = params, + data = as_xgb_data(data), + nrounds = nrounds, + watchlist = map(watchlist, ~ as_xgb_data(as_ped(data, newdata = .x))), + obj = obj, + feval = feval, + verbose = verbose, + print_every_n = print_every_n, + early_stopping_rounds = early_stopping_rounds, + maximize = maximize, + save_period = save_period, + save_name = save_name, + xgb_model = xgb_model, + callbacks = callbacks, + base_score = base_score, + objective = "count:poisson", + nthread = nthread, + ... + ) + + omit_vars <- c(attributes(data)[c("id_var", "intvars")] %>% unlist(), "ped_status") %>% + unique() + keep_vars <- unique(c("tend", setdiff(colnames(data), omit_vars))) + attr(pam_xgb, "attr_ped") <- attr_ped + pam_xgb[["trafo_args"]] <- trafo_args + pam_xgb[["orig_features"]] <- keep_vars + class(pam_xgb) <- c("pam_xgb", class(pam_xgb)) + + pam_xgb + +} + + +#' Runs xgb.train.ped on cross-validation sets +#' +#' +#' @inheritParams xgb.train.ped +#' @param nfold Number of cross-valdation folds. +#' @param ped_params List of parameters used to transform data into PED format. +#' @export +xgb.cv_pam <- function( + params = list(), + data, + nrounds, + nfold = 4, + cv_indices, + ped_params = list(), + nthread = 1L, + verbose = FALSE, + print_every_n = 1L, + early_stopping_rounds = NULL, + ... +) { + + if (missing(cv_indices)) { + cv_indices <- get_cv_indices(data$status, nfold) + } + + cv_res <- map( + cv_indices, + ~{ + if (class(data) == "list") { + data_train <- data[[1]][.x$ind_train, , drop = FALSE] + data_test <- data[[1]][.x$ind_test, , drop =FALSE] + ped_params[["data"]] <- list(data_train, data[[2]]) + watch <- list(data_test, data[[2]]) + } else { + ped_params[["data"]] <- data[.x$ind_train, , drop = FALSE] + watch <- data[.x$ind_test, , drop = FALSE] + } + # ped <- as_ped(data = ped_params$data, formula = ped_params$formula) + ped <- do.call(as_ped, ped_params) + xgb <- xgb.train.ped( + params = params, + data = ped, + watchlist = list(eval = watch), + nthread = nthread, + nrounds = nrounds, + verbose = verbose, + print_every_n = print_every_n, + early_stopping_rounds = early_stopping_rounds) + } + ) + + +} + + +#' Tune xgb pam +#' +#' @inheritParams xgb.cv_pam +#' @param param_df A data frame of parameter combinations to tune. One +#' row contains one parameter set that will be passed on to +#' \code{params} in \code{xgb.cv_pam}. +#' @importFrom prodlim Hist +#' @importFrom survival Surv +#' @importFrom pec pec crps +#' @export +xgb.tune_pam <- function( + data, + param_df, + nrounds, + cv_indices, + nfold = 4, + ped_params = list(), + nthread = 1L, + early_stopping_rounds = NULL, + verbose = FALSE, + print_every_n = 1L, + ... +) { + + param_list <- split(param_df, seq_len(nrow(param_df))) + + if (missing(cv_indices)) { + cv_indices <- get_cv_indices(data$status, nfold) + } + + tune_res <- map_dfr( + param_list, + ~ { + print(.x) + cv_res <- xgb.cv_pam( + data = data, + params = .x, + nrounds = nrounds, + nfold = nfold, + cv_indices = cv_indices, + ped_params = ped_params, + nthread = nthread, + early_stopping_rounds = early_stopping_rounds, + verbose = verbose, + print_every_n = print_every_n + ) + cv_iter_best <- purrr::map_dbl(cv_res, ~.x$best_iteration) + ibs_df <- purrr::map2_dfr( + .x = cv_res, + .y = cv_indices, + ~ { + test_dat <- data + if(class(test_dat) == "list") { + test_dat[[1]] <- data[[1]][.y$ind_test, , drop = FALSE] + } else { + test_dat <- test_dat[.y$ind_test, , drop = FALSE] + } + get_ibs( + object = .x, + data = test_dat, + pec_params = list( + formula = Hist(time, status) ~ 1, + start = .01, + exact = FALSE), + keep_reference = FALSE) + }, + .id = "fold" + ) + + ibs_df %>% + group_by(fold, cause) %>% + summarize(IBS = mean(IBS)) %>% + group_by(fold) %>% + summarize(IBS = mean(IBS)) %>% + mutate(nrounds = cv_iter_best) + + }, + .id = "id" + ) + + tune_res %>% + dplyr::group_by(id) %>% + dplyr::summarize( + IBS = mean(IBS), + nrounds = round(mean(nrounds))) %>% + dplyr::mutate(param_set = param_list) + +} + + +#' Runs xgb.train.ped on cross-validation sets +#' +#' +#' @inheritParams xgb.train.ped +#' @param nfold Number of cross-valdation folds. +#' @param ped_params List of parameters used to transform data into PED format. +#' @export +rf.cv_pam <- function( + params = list(), + data, + nrounds = 1L, + nfold = 4, + cv_indices, + ped_params = list(), + nthread = 1L, + verbose = FALSE, + print_every_n = 1L, + early_stopping_rounds = NULL, + ... +) { + + if (missing(cv_indices)) { + cv_indices <- get_cv_indices(data$status, nfold) + } + + cv_res <- map( + cv_indices, + ~{ + if (class(data) == "list") { + data_train <- data[[1]] + ped_params[["data"]] <- list(data_train, data[[2]]) + } else { + ped_params[["data"]] <- data + } + # ped <- as_ped(data = ped_params$data, formula = ped_params$formula) + ped <- do.call(as_ped, ped_params) + xgb <- xgb.train.ped( + params = params, + data = ped, + nthread = nthread, + nrounds = nrounds, + verbose = verbose, + print_every_n = print_every_n) + } + ) + + +} + + +#' Tune xgb pam +#' +#' @inheritParams rf.cv_pam +#' @param param_df A data frame of parameter combinations to tune. One +#' row contains one parameter set that will be passed on to +#' \code{params} in \code{xgb.cv_pam}. +#' @importFrom prodlim Hist +#' @importFrom survival Surv +#' @importFrom pec pec crps +#' @export +rf.tune_pam <- function( + data, + param_df, + nrounds = 1L, + cv_indices, + nfold = 4, + ped_params = list(), + nthread = 1L, + early_stopping_rounds = NULL, + verbose = FALSE, + print_every_n = 1L, + ... +) { + + param_list <- split(param_df, seq_len(nrow(param_df))) + + if (missing(cv_indices)) { + cv_indices <- get_cv_indices(data$status, nfold) + } + + tune_res <- map_dfr( + param_list, + ~ { + print(.x) + cv_res <- xgb.cv_pam( + data = data, + params = .x, + nrounds = nrounds, + nfold = nfold, + cv_indices = cv_indices, + ped_params = ped_params, + nthread = nthread, + early_stopping_rounds = early_stopping_rounds, + verbose = verbose, + print_every_n = print_every_n + ) + ibs_df <- purrr::map2_dfr( + .x = cv_res, + .y = cv_indices, + ~ { + test_dat <- data + if(class(test_dat) == "list") { + test_dat[[1]] <- data[[1]][.y$ind_test, , drop = FALSE] + } else { + test_dat <- test_dat[.y$ind_test, , drop = FALSE] + } + get_ibs( + object = .x, + data = test_dat, + pec_params = list( + formula = Hist(time, status) ~ 1, + start = .01, + exact = FALSE), + keep_reference = FALSE) + }, + .id = "fold" + ) + + ibs_df %>% + group_by(fold, cause) %>% + summarize(IBS = mean(IBS)) %>% + group_by(fold) %>% + summarize(IBS = mean(IBS)) + + }, + .id = "id" + ) + + tune_res %>% + dplyr::group_by(id) %>% + dplyr::summarize( + IBS = mean(IBS), + nrounds = round(mean(nrounds))) %>% + dplyr::mutate(param_set = param_list) + +} diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..2e2400f --- /dev/null +++ b/README.Rmd @@ -0,0 +1,144 @@ +# pam.xgb +Survival analysis with xgboost +(Prototype, will have to think more about design (also **`pammtools`**) + +tunning, etc.) + +## Example +```{r} +# setup +# remotes::install_github("adibender/pammtools", ref = "master") +library(pammtools) # requires github version +library(dplyr) +library(mgcv) +devtools::load_all() + +library(ggplot2) +theme_set(theme_bw()) +``` + +### Data Simulation +```{r} +# set number of observations/subjects +n <- 500 +# create data set with variables which will affect the hazard rate. +df <- cbind.data.frame(x1 = runif (n, -3, 3), x2 = runif (n, 0, 6)) %>% + as_tibble() +# the formula which specifies how covariates affet the hazard rate +f0 <- function(t) { + dgamma(t, 8, 2) * 6 +} +form <- ~ - 3.5 + f0(t) - 0.5 * x1 + sqrt(x2) + +sim_df <- sim_pexp(form, df, seq(0, 10, by = .2)) %>% + mutate(time = round(time, 6)) +head(sim_df) +# plot(survival::survfit(survival::Surv(time, status)~1, data = sim_df )) +``` + +### Model Fitting +```{r} +# transform "standard" survival data to PED format +ped <- sim_df %>% as_ped(formula = Surv(time, status) ~ .) + +# fit PAM +pam <- gam(ped_status ~ s(tend, k = 20) + x1 + s(x2), data = ped, + family = "poisson", offset = offset, method = "REML") +summary(pam) + +# fit xgboost model +xgb_params <- list( + max_depth = 3, + eta = .01, + colsample_bytree = .7, + min_child_weight = 10, + subsample = .7) + +xgb_pam <- xgboost.ped( + ped, + params = xgb_params, + nrounds = 1500, + print_every_n = 500) +``` + +### Visualization +```{r, fig.width = 6, fig.height = 3} +## add hazard and surv_prob for xgb model +pred_df <- ped %>% make_newdata(tend = unique(ped$tend), x1 = c(-.5, 1)) +pred_df <- pred_df %>% + add_hazard2(xgb_pam) %>% + add_hazard(pam) +## add hazard and surv_prob for pam +pred_df <- pred_df %>% + group_by(x1) %>% + add_surv_prob2(xgb_pam) %>% + add_surv_prob(pam) +# add true hazard and survival probability +pred_df <- pred_df %>% + mutate( + truth_hazard = exp(-3.5 + f0(tend) -0.5 * x1 + sqrt(x2)), + truth = exp(-cumsum(truth_hazard * intlen))) + +# Hazard +ggplot(pred_df, aes(x = tend)) + + geom_stephazard(aes(y = hazard_pam_xgb), col = "green") + + geom_stephazard(aes(y = hazard), col = "black") + + geom_line(aes(y = truth_hazard), col = "blue", lty = 3) + + facet_wrap(~x1) + +# Survival probability +ggplot(pred_df, aes(x = tend)) + + geom_line(aes(y = surv_prob_pam_xgb), col = "green") + + geom_line(aes(y = surv_prob), col = "black") + + geom_line(aes(y = truth), col = "blue", lty = 3) + + scale_y_continuous(limits = c(0, 1)) + + facet_wrap(~x1) +``` + + + +## Benchmarking (illustration) +```{r} +library(survival) +library(obliqueRSF) +data("tumor", package = "pammtools") + +## estimate models for benchmarking (on subset of data) +# xgboost +xgb_params <- list( + max_depth = 3, + eta = .1, + colsample_bytree = .7, + min_child_weight = 10, + subsample = .7) + +train_idx <- sample(seq_len(nrow(tumor)), 600) +test_idx <- setdiff(seq_len(nrow(tumor)), train_idx) +## pam xgboost model +pxgb <- xgboost.ped( + data = as_ped(tumor[train_idx, ], Surv(days, status) ~ .), + params = xgb_params, + nrounds = 400, + print_every_n = 500L) +## cox ph +mcox <- coxph(Surv(days, status)~., data = tumor[train_idx, ], x = TRUE) +## oblique RSF +orsf <- ORSF(tumor[train_idx,], time = "days", verbose = FALSE) +``` + + + +```{r, fig.width = 5, fig.height = 5} +## Evaluation +library(pec) +# prediction error curve (Brier Score) +pec1 <- pec( + list(pam_xgb = pxgb, coxph = mcox, orsf = orsf), + Surv(days, status) ~ 1, + exact = FALSE, + data = tumor[test_idx, ], + times = seq(1, 2500, by = 10), + start = 1) +plot(pec1) +## Integrated Brier Score at 4 time points +crps(pec1, times = c(500, 1000, 1500, 2000, 2500)) +``` diff --git a/man/add_hazard2.Rd b/man/add_hazard2.Rd new file mode 100644 index 0000000..7366701 --- /dev/null +++ b/man/add_hazard2.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add-functions.R +\name{add_hazard2} +\alias{add_hazard2} +\alias{add_cumu_hazard2} +\title{Add predicted (cumulative) hazard to data set} +\usage{ +add_hazard2( + newdata, + object, + reference = NULL, + overwrite = FALSE, + time_var = NULL, + ... +) + +add_cumu_hazard2( + newdata, + object, + overwrite = FALSE, + time_var = NULL, + interval_length = "intlen", + ... +) +} +\arguments{ +\item{newdata}{ A data frame or list containing the values of the model covariates at which predictions + are required. If this is not provided then predictions corresponding to the + original data are returned. If \code{newdata} is provided then + it should contain all the variables needed for prediction: a + warning is generated if not. See details for use with \code{link{linear.functional.terms}}. } + +\item{object}{ a fitted \code{gam} object as produced by \code{gam()}. + } + +\item{reference}{A data frame with number of rows equal to \code{nrow(newdata)} or +one, or a named list with (partial) covariate specifications. See examples.} + +\item{overwrite}{Should hazard columns be overwritten if already present in +the data set? Defaults to \code{FALSE}.} + +\item{time_var}{Name of the variable used for the baseline hazard. If +not given, defaults to \code{"tend"} for \code{\link[mgcv]{gam}} fits, else +\code{"interval"}. The latter is assumed to be a factor, the former +numeric.} + +\item{...}{Further arguments passed to \code{\link[xgboost]{predict.xgb.Booster}}.} + +\item{interval_length}{The variable in newdata containing the interval lengths. +Can be either bare unquoted variable name or character. Defaults to \code{"intlen"}.} +} +\description{ +Add (cumulative) hazard based on the provided data set and model. +} diff --git a/man/add_surv_prob2.Rd b/man/add_surv_prob2.Rd new file mode 100644 index 0000000..eba6f33 --- /dev/null +++ b/man/add_surv_prob2.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add-functions.R +\name{add_surv_prob2} +\alias{add_surv_prob2} +\title{Add survival probability estimates} +\usage{ +add_surv_prob2( + newdata, + object, + overwrite = FALSE, + time_var = NULL, + interval_length = "intlen", + ... +) +} +\arguments{ +\item{newdata}{ A data frame or list containing the values of the model covariates at which predictions + are required. If this is not provided then predictions corresponding to the + original data are returned. If \code{newdata} is provided then + it should contain all the variables needed for prediction: a + warning is generated if not. See details for use with \code{link{linear.functional.terms}}. } + +\item{object}{ a fitted \code{gam} object as produced by \code{gam()}. + } + +\item{overwrite}{Should hazard columns be overwritten if already present in +the data set? Defaults to \code{FALSE}.} + +\item{time_var}{Name of the variable used for the baseline hazard. If +not given, defaults to \code{"tend"} for \code{\link[mgcv]{gam}} fits, else +\code{"interval"}. The latter is assumed to be a factor, the former +numeric.} + +\item{interval_length}{The variable in newdata containing the interval lengths. +Can be either bare unquoted variable name or character. Defaults to \code{"intlen"}.} + +\item{...}{Further arguments passed to \code{\link[xgboost]{predict.xgb.Booster}}.} +} +\description{ +Given suitable data (i.e. data with all columns used for estimation of the model), +this functions adds a column \code{surv_prob} containing survival probabilities +for the specified covariate and follow-up information. +} diff --git a/man/as.data.frame.crps.Rd b/man/as.data.frame.crps.Rd new file mode 100644 index 0000000..3c97225 --- /dev/null +++ b/man/as.data.frame.crps.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pec-helpers.R +\name{as.data.frame.crps} +\alias{as.data.frame.crps} +\title{Transform crps object to data.frame} +\usage{ +\method{as.data.frame}{crps}(x, row.names = NULL, optional = FALSE, ...) +} +\arguments{ +\item{x}{An object of class \code{crps}. See \code{\link[pec]{crps}}.} + +\item{row.names}{\code{NULL} or a character vector giving the row + names for the data frame. Missing values are not allowed.} + +\item{optional}{logical. If \code{TRUE}, setting row names and + converting column names (to syntactic names: see + \code{\link[base]{make.names}}) is optional. Note that all of \R's + \pkg{base} package \code{as.data.frame()} methods use + \code{optional} only for column names treatment, basically with the + meaning of \code{\link[base]{data.frame}(*, check.names = !optional)}. + See also the \code{make.names} argument of the \code{matrix} method.} + +\item{...}{additional arguments to be passed to or from methods.} +} +\description{ +A\code{as.data.frame} S3 method for objects of class \code{\link[pec]{crps}}. +} diff --git a/man/as_ped.Rd b/man/as_ped.Rd new file mode 100644 index 0000000..37df262 --- /dev/null +++ b/man/as_ped.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data-trafo-helpers.R +\name{as_ped.pam_xgb} +\alias{as_ped.pam_xgb} +\alias{as_ped.ped} +\title{Transform newdata to PED format based on fitted xgb model} +\usage{ +\method{as_ped}{pam_xgb}(data, newdata, ...) + +\method{as_ped}{ped}(data, newdata, ...) +} +\description{ +Transform newdata to PED format based on fitted xgb model +} diff --git a/man/as_xgb_data.Rd b/man/as_xgb_data.Rd new file mode 100644 index 0000000..f47c777 --- /dev/null +++ b/man/as_xgb_data.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data-trafo-helpers.R +\name{as_xgb_data} +\alias{as_xgb_data} +\alias{as_xgb_data.data.frame} +\alias{as_xgb_data.ped} +\title{Transform data suitable for xgboost applications} +\usage{ +as_xgb_data(x, ...) + +\method{as_xgb_data}{data.frame}(x, features, label, set_margin = TRUE, ...) + +\method{as_xgb_data}{ped}(x, ...) +} +\description{ +Transform data suitable for xgboost applications +} diff --git a/man/as_xgb_newdata.Rd b/man/as_xgb_newdata.Rd new file mode 100644 index 0000000..a3fed89 --- /dev/null +++ b/man/as_xgb_newdata.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data-trafo-helpers.R +\name{as_xgb_newdata} +\alias{as_xgb_newdata} +\title{Transform new data to xgb-PED format using same arguments as previous transformation} +\usage{ +as_xgb_newdata(x, newdata, ...) +} +\description{ +Transform new data to xgb-PED format using same arguments as previous transformation +} +\keyword{internal} diff --git a/man/as_xgb_newdata.pam_xgb.Rd b/man/as_xgb_newdata.pam_xgb.Rd new file mode 100644 index 0000000..e034604 --- /dev/null +++ b/man/as_xgb_newdata.pam_xgb.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data-trafo-helpers.R +\name{as_xgb_newdata.pam_xgb} +\alias{as_xgb_newdata.pam_xgb} +\title{Transform new data to xgb-PED format using same arguments as previous transformation} +\usage{ +\method{as_xgb_newdata}{pam_xgb}(x, newdata, ...) +} +\description{ +Transform new data to xgb-PED format using same arguments as previous transformation +} +\keyword{internal} diff --git a/man/as_xgb_newdata.ped.Rd b/man/as_xgb_newdata.ped.Rd new file mode 100644 index 0000000..1e49a60 --- /dev/null +++ b/man/as_xgb_newdata.ped.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data-trafo-helpers.R +\name{as_xgb_newdata.ped} +\alias{as_xgb_newdata.ped} +\title{Transform new data to xgb-PED format using same arguments as previous transformation} +\usage{ +\method{as_xgb_newdata}{ped}(x, newdata, ...) +} +\description{ +Transform new data to xgb-PED format using same arguments as previous transformation +} +\keyword{internal} diff --git a/man/get_cindex.Rd b/man/get_cindex.Rd new file mode 100644 index 0000000..aec7a5e --- /dev/null +++ b/man/get_cindex.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pec-helpers.R +\name{get_cindex} +\alias{get_cindex} +\title{C-Index from pec package} +\usage{ +get_cindex( + object, + data, + cind_params = list(formula = Surv(time, status) ~ 1, exact = FALSE, start = 0.01), + eval_times = NULL, + q_last = 0.8, + q_eval = c(0.25, 0.5, 0.75) +) +} +\description{ +C-Index from pec package +} diff --git a/man/get_cumu_hazard2.Rd b/man/get_cumu_hazard2.Rd new file mode 100644 index 0000000..b9f479c --- /dev/null +++ b/man/get_cumu_hazard2.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add-functions.R +\name{get_cumu_hazard2} +\alias{get_cumu_hazard2} +\title{Calculate cumulative hazard} +\usage{ +get_cumu_hazard2( + object, + newdata, + time_var = NULL, + interval_length = "intlen", + ... +) +} +\arguments{ +\item{object}{ a fitted \code{gam} object as produced by \code{gam()}. + } + +\item{newdata}{ A data frame or list containing the values of the model covariates at which predictions + are required. If this is not provided then predictions corresponding to the + original data are returned. If \code{newdata} is provided then + it should contain all the variables needed for prediction: a + warning is generated if not. See details for use with \code{link{linear.functional.terms}}. } + +\item{time_var}{Name of the variable used for the baseline hazard. If +not given, defaults to \code{"tend"} for \code{\link[mgcv]{gam}} fits, else +\code{"interval"}. The latter is assumed to be a factor, the former +numeric.} + +\item{interval_length}{The variable in newdata containing the interval lengths. +Can be either bare unquoted variable name or character. Defaults to \code{"intlen"}.} + +\item{...}{Further arguments passed to \code{\link[xgboost]{predict.xgb.Booster}}.} +} +\description{ +Calculate cumulative hazard +} +\keyword{internal} diff --git a/man/get_cv_indices.Rd b/man/get_cv_indices.Rd new file mode 100644 index 0000000..b668632 --- /dev/null +++ b/man/get_cv_indices.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cv-splits.R +\name{get_cv_indices} +\alias{get_cv_indices} +\title{Create a lst of train/test indices} +\usage{ +get_cv_indices(y, nfold = 4) +} +\arguments{ +\item{nfold}{number of folds for cross-validation.} + +\item{n}{number of rows in data set.} +} +\description{ +Create a lst of train/test indices +} +\keyword{internal} diff --git a/man/get_ibs.Rd b/man/get_ibs.Rd new file mode 100644 index 0000000..f10e7c0 --- /dev/null +++ b/man/get_ibs.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pec-helpers.R +\name{get_ibs} +\alias{get_ibs} +\title{Extract IBS in tidy format} +\usage{ +get_ibs( + object, + data, + pec_params = list(formula = Surv(time, status) ~ 1, exact = FALSE, start = 0.01), + keep_reference = TRUE, + q_last = 0.8, + q_eval = c(0.25, 0.5, 0.75), + ... +) +} +\arguments{ +\item{object}{A named list of prediction models, where allowed entries are +(1) R-objects for which a \link[pec]{predictSurvProb} method exists (see +details), (2) a \code{call} that evaluates to such an R-object (see +examples), (3) a matrix with predicted probabilities having as many rows as +\code{data} and as many columns as \code{times}. For cross-validation all +objects in this list must include their \code{call}.} + +\item{data}{A data frame in which to validate the prediction models and to +fit the censoring model. If \code{data} is missing, try to extract a data +set from the first element in object.} + +\item{keep_reference}{Logical. Should Brier Score of Reference model be kept +in results table.} + +\item{...}{Not used.} +} +\description{ +Extract IBS in tidy format +} diff --git a/man/get_split_indices.Rd b/man/get_split_indices.Rd new file mode 100644 index 0000000..75081a3 --- /dev/null +++ b/man/get_split_indices.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cv-splits.R +\name{get_split_indices} +\alias{get_split_indices} +\title{Split data into training and eval data for watchlist} +\usage{ +get_split_indices(n, split_frac = 0.7) +} +\description{ +Split data into training and eval data for watchlist +} +\keyword{internal} diff --git a/man/get_surv_prob2.Rd b/man/get_surv_prob2.Rd new file mode 100644 index 0000000..74a7617 --- /dev/null +++ b/man/get_surv_prob2.Rd @@ -0,0 +1,38 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/add-functions.R +\name{get_surv_prob2} +\alias{get_surv_prob2} +\title{Calculate survival probabilities} +\usage{ +get_surv_prob2( + object, + newdata, + time_var = NULL, + interval_length = "intlen", + ... +) +} +\arguments{ +\item{object}{ a fitted \code{gam} object as produced by \code{gam()}. + } + +\item{newdata}{ A data frame or list containing the values of the model covariates at which predictions + are required. If this is not provided then predictions corresponding to the + original data are returned. If \code{newdata} is provided then + it should contain all the variables needed for prediction: a + warning is generated if not. See details for use with \code{link{linear.functional.terms}}. } + +\item{time_var}{Name of the variable used for the baseline hazard. If +not given, defaults to \code{"tend"} for \code{\link[mgcv]{gam}} fits, else +\code{"interval"}. The latter is assumed to be a factor, the former +numeric.} + +\item{interval_length}{The variable in newdata containing the interval lengths. +Can be either bare unquoted variable name or character. Defaults to \code{"intlen"}.} + +\item{...}{Further arguments passed to \code{\link[xgboost]{predict.xgb.Booster}}.} +} +\description{ +Calculate survival probabilities +} +\keyword{internal} diff --git a/man/predict.pam_xgb.Rd b/man/predict.pam_xgb.Rd new file mode 100644 index 0000000..4ffb3ed --- /dev/null +++ b/man/predict.pam_xgb.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict.R +\name{predict.pam_xgb} +\alias{predict.pam_xgb} +\title{Predict hazard, cumulative hazard or survival probability} +\usage{ +\method{predict}{pam_xgb}(object, newdata, type = c("hazard", "cumu_hazard", "surv_prob"), ...) +} +\arguments{ +\item{object}{An object of class \code{pam_xgb}} + +\item{newdata}{A data set containing the same covariates as used for model +fitting. If of class \code{data.frame}, the function will try to transform +to the \code{xgb.DMatrix} format.} + +\item{type}{The type of prediction desired. Either hazard (\code{type = "hazard"}), +cumulative hazard (\code{type = "cumu_hazard"}) or survival probability +(\code{type = "surv_prob"}).} + +\item{times}{A vector of times for which predictions should be generated} +} +\value{ +A matrix of predictions containing one row per +observation (row in newdata) and 1 column per specified time in the +\code{times} argument. +} +\description{ +Predict hazard, cumulative hazard or survival probability +} diff --git a/man/predictEventProb.pam_xgb.Rd b/man/predictEventProb.pam_xgb.Rd new file mode 100644 index 0000000..c114e37 --- /dev/null +++ b/man/predictEventProb.pam_xgb.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict.R +\name{predictEventProb.pam_xgb} +\alias{predictEventProb.pam_xgb} +\title{S3 method for compatibility with package pec} +\usage{ +\method{predictEventProb}{pam_xgb}(object, newdata, times, cause, ...) +} +\description{ +This function is needed to use \code{pec::pec} in the competing risks setting. +} diff --git a/man/predictSurvProb.pam_xgb.Rd b/man/predictSurvProb.pam_xgb.Rd new file mode 100644 index 0000000..037e402 --- /dev/null +++ b/man/predictSurvProb.pam_xgb.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/predict.R +\name{predictSurvProb.pam_xgb} +\alias{predictSurvProb.pam_xgb} +\title{S3 method for compatibility with package pec} +\usage{ +\method{predictSurvProb}{pam_xgb}(object, newdata, times) +} +\description{ +S3 method for compatibility with package pec +} diff --git a/man/random_params.Rd b/man/random_params.Rd new file mode 100644 index 0000000..26d25da --- /dev/null +++ b/man/random_params.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cv-splits.R +\name{random_params} +\alias{random_params} +\title{Draw random parameters from prespecified parameter space} +\usage{ +random_params(param_space) +} +\description{ +Checks if integer or double. Samples integers or draws from uniform distribution +respectively. +} diff --git a/man/rf.cv_pam.Rd b/man/rf.cv_pam.Rd new file mode 100644 index 0000000..368dad0 --- /dev/null +++ b/man/rf.cv_pam.Rd @@ -0,0 +1,101 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost-fit.R +\name{rf.cv_pam} +\alias{rf.cv_pam} +\title{Runs xgb.train.ped on cross-validation sets} +\usage{ +rf.cv_pam( + params = list(), + data, + nrounds = 1L, + nfold = 4, + cv_indices, + ped_params = list(), + nthread = 1L, + verbose = FALSE, + print_every_n = 1L, + early_stopping_rounds = NULL, + ... +) +} +\arguments{ +\item{params}{the list of parameters. + The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}. + Below is a shorter summary: + +1. General Parameters + +\itemize{ + \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}. +} + +2. Booster Parameters + +2.1. Parameter for Tree Booster + +\itemize{ + \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3 + \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. + \item \code{max_depth} maximum depth of a tree. Default: 6 + \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 + \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1 + \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 + \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1 + \item \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length equals to the number of features in the training data. \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint. + \item \code{interaction_constraints} A list of vectors specifying feature indices of permitted interactions. Each item of the list represents one permitted interaction where specified features are allowed to interact with each other. Feature index values should start from \code{0} (\code{0} references the first column). Leave argument unspecified for no interaction constraints. +} + +2.2. Parameter for Linear Booster + +\itemize{ + \item \code{lambda} L2 regularization term on weights. Default: 0 + \item \code{lambda_bias} L2 regularization term on bias. Default: 0 + \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0 +} + +3. Task Parameters + +\itemize{ +\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: + \itemize{ + \item \code{reg:squarederror} Regression with squared loss (Default). + \item \code{reg:logistic} logistic regression. + \item \code{binary:logistic} logistic regression for binary classification. Output probability. + \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. + \item \code{num_class} set the number of classes. To use only with multiclass objectives. + \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}. + \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. + \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. + } + \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 + \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. +}} + +\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input. +\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.} + +\item{nrounds}{max number of boosting iterations.} + +\item{nfold}{Number of cross-valdation folds.} + +\item{ped_params}{List of parameters used to transform data into PED format.} + +\item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance. +If 2, some additional information will be printed out. +Note that setting \code{verbose > 0} automatically engages the +\code{cb.print.evaluation(period=1)} callback function.} + +\item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}. +Default is 1 which means all messages are printed. This parameter is passed to the +\code{\link[xgboost]{cb.print.evaluation}} callback.} + +\item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +doesn't improve for \code{k} rounds. +Setting this parameter engages the \code{\link[xgboost]{cb.early.stop}} callback.} + +\item{...}{other parameters to pass to \code{params}.} +} +\description{ +Runs xgb.train.ped on cross-validation sets +} diff --git a/man/rf.tune_pam.Rd b/man/rf.tune_pam.Rd new file mode 100644 index 0000000..708d2c3 --- /dev/null +++ b/man/rf.tune_pam.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost-fit.R +\name{rf.tune_pam} +\alias{rf.tune_pam} +\title{Tune xgb pam} +\usage{ +rf.tune_pam( + data, + param_df, + nrounds = 1L, + cv_indices, + nfold = 4, + ped_params = list(), + nthread = 1L, + early_stopping_rounds = NULL, + verbose = FALSE, + print_every_n = 1L, + ... +) +} +\arguments{ +\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input. +\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.} + +\item{param_df}{A data frame of parameter combinations to tune. One +row contains one parameter set that will be passed on to +\code{params} in \code{xgb.cv_pam}.} + +\item{nrounds}{max number of boosting iterations.} + +\item{nfold}{Number of cross-valdation folds.} + +\item{ped_params}{List of parameters used to transform data into PED format.} + +\item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +doesn't improve for \code{k} rounds. +Setting this parameter engages the \code{\link[xgboost]{cb.early.stop}} callback.} + +\item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance. +If 2, some additional information will be printed out. +Note that setting \code{verbose > 0} automatically engages the +\code{cb.print.evaluation(period=1)} callback function.} + +\item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}. +Default is 1 which means all messages are printed. This parameter is passed to the +\code{\link[xgboost]{cb.print.evaluation}} callback.} + +\item{...}{other parameters to pass to \code{params}.} +} +\description{ +Tune xgb pam +} diff --git a/man/sim_pexp_cr.Rd b/man/sim_pexp_cr.Rd new file mode 100644 index 0000000..1d7d7e0 --- /dev/null +++ b/man/sim_pexp_cr.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sim-cr.R +\name{sim_pexp_cr} +\alias{sim_pexp_cr} +\title{Simulate competing risks time-to-event data via piece-wise exponential distribution} +\usage{ +sim_pexp_cr(formula, data, cut) +} +\arguments{ +\item{formula}{An extended formula that specifies the linear predictor. +If you want to include a smooth baseline +or time-varying effects, use \code{t} within your formula as +if it was a covariate in the data, although it is not and should not +be included in the \code{data} provided to \code{sim_pexp}. See examples +below.} + +\item{data}{A data set with variables specified in \code{formula}.} + +\item{cut}{A sequence of time-points starting with 0.} +} +\description{ +Piece-wise exponential implementation of simulation algorithm described in +Beyersmann et al. 2009 (). +} +\examples{ +library(pammtools) +library(dplyr) +library(survival) +# Example from Competing Risks book for comparison +simul.dat.cp <- function(n, h01, h02, cens.param) { + times <- rexp(n, h01 + h02) + ev <- rbinom(n, size = 1, prob = h01 / (h01 + h02)) + ev <- ifelse(ev == 0, 2, 1) + cens.time <- runif(n, cens.param[1], cens.param[2]) + obs.time <- pmin(times, cens.time) + obs.cause <- as.numeric(times == obs.time) * ev + data.frame(obs.time, obs.cause) +} +set.seed(923) +n <- 1000 +dat.exo7 <- simul.dat.cp(n, 0.5, 0.9, c(0.5, 3)) +# We compute the Nelson-Aalen estimates using survfit() +# That’s just to get the number of events and the risk set +temp01 <- survfit(Surv(obs.time, obs.cause == 1) ~ 1, dat.exo7) +temp02 <- survfit(Surv(obs.time, obs.cause == 2) ~ 1, dat.exo7) +na01 <- cumsum(temp01$n.event / temp01$n.risk) +na02 <- cumsum(temp02$n.event / temp02$n.risk) + +plot(temp01$time, na02, type="s", ylab="Cumulative transition hazard", xlab="time") +lines(temp01$time, na01, type="s", col=2) + +# create data set with variables which will affect the hazard rate +# (not used here, but usually more complex examples of interest) +df <- cbind.data.frame(x1 = runif (n, -3, 3), x2 = runif (n, 0, 6)) \%>\% + as_tibble() +set.seed(24032018) +df <- cbind.data.frame( + x1 = runif (n, -3, 3), + x2 = runif (n, 0, 6)) +# two component formula specifying cause specific hazards +form <- ~ log(0.5)| log(0.9) +sim_df <- sim_pexp_cr(form, df, seq(0, 3, by =.25)) \%>\% + mutate( + cens_time = runif(n(), 0.5, 3), + status = if_else(cens_time < time, 0, 1), + time = pmin(time, cens_time), + type = status * type) +temp01_2 <- survfit(Surv(time,type == 1) ~ 1, sim_df) +temp02_2 <- survfit(Surv(time, type == 2) ~ 1, sim_df) +na01_2 <- cumsum(temp01_2$n.event / temp01_2$n.risk) +na02_2 <- cumsum(temp02_2$n.event / temp02_2$n.risk) + +lines(temp01_2$time, na02_2, type="s", col = 3) +lines(temp01_2$time, na01_2, type="s", col = 4) +} diff --git a/man/times_list.Rd b/man/times_list.Rd new file mode 100644 index 0000000..da28e62 --- /dev/null +++ b/man/times_list.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pec-helpers.R +\name{times_list} +\alias{times_list} +\title{Look up Quantiles of event times in data} +\usage{ +times_list( + data, + q_last = 0.8, + q_eval = c(0.25, 0.5, 0.75), + time_var = "time", + status_var = "status", + status_value = 1 +) +} +\description{ +Look up Quantiles of event times in data +} +\keyword{internal} diff --git a/man/xgb.cv_pam.Rd b/man/xgb.cv_pam.Rd new file mode 100644 index 0000000..cd4e5ab --- /dev/null +++ b/man/xgb.cv_pam.Rd @@ -0,0 +1,101 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost-fit.R +\name{xgb.cv_pam} +\alias{xgb.cv_pam} +\title{Runs xgb.train.ped on cross-validation sets} +\usage{ +xgb.cv_pam( + params = list(), + data, + nrounds, + nfold = 4, + cv_indices, + ped_params = list(), + nthread = 1L, + verbose = FALSE, + print_every_n = 1L, + early_stopping_rounds = NULL, + ... +) +} +\arguments{ +\item{params}{the list of parameters. + The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}. + Below is a shorter summary: + +1. General Parameters + +\itemize{ + \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}. +} + +2. Booster Parameters + +2.1. Parameter for Tree Booster + +\itemize{ + \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3 + \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. + \item \code{max_depth} maximum depth of a tree. Default: 6 + \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 + \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1 + \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 + \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1 + \item \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length equals to the number of features in the training data. \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint. + \item \code{interaction_constraints} A list of vectors specifying feature indices of permitted interactions. Each item of the list represents one permitted interaction where specified features are allowed to interact with each other. Feature index values should start from \code{0} (\code{0} references the first column). Leave argument unspecified for no interaction constraints. +} + +2.2. Parameter for Linear Booster + +\itemize{ + \item \code{lambda} L2 regularization term on weights. Default: 0 + \item \code{lambda_bias} L2 regularization term on bias. Default: 0 + \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0 +} + +3. Task Parameters + +\itemize{ +\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: + \itemize{ + \item \code{reg:squarederror} Regression with squared loss (Default). + \item \code{reg:logistic} logistic regression. + \item \code{binary:logistic} logistic regression for binary classification. Output probability. + \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. + \item \code{num_class} set the number of classes. To use only with multiclass objectives. + \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}. + \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. + \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. + } + \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 + \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. +}} + +\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input. +\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.} + +\item{nrounds}{max number of boosting iterations.} + +\item{nfold}{Number of cross-valdation folds.} + +\item{ped_params}{List of parameters used to transform data into PED format.} + +\item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance. +If 2, some additional information will be printed out. +Note that setting \code{verbose > 0} automatically engages the +\code{cb.print.evaluation(period=1)} callback function.} + +\item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}. +Default is 1 which means all messages are printed. This parameter is passed to the +\code{\link[xgboost]{cb.print.evaluation}} callback.} + +\item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +doesn't improve for \code{k} rounds. +Setting this parameter engages the \code{\link[xgboost]{cb.early.stop}} callback.} + +\item{...}{other parameters to pass to \code{params}.} +} +\description{ +Runs xgb.train.ped on cross-validation sets +} diff --git a/man/xgb.train.ped.Rd b/man/xgb.train.ped.Rd new file mode 100644 index 0000000..1e73243 --- /dev/null +++ b/man/xgb.train.ped.Rd @@ -0,0 +1,138 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost-fit.R +\name{xgb.train.ped} +\alias{xgb.train.ped} +\title{PAMM wrapper for xgb.train} +\usage{ +xgb.train.ped( + params = list(), + data, + nrounds, + watchlist = list(), + obj = NULL, + feval = NULL, + verbose = 1, + print_every_n = 1L, + early_stopping_rounds = NULL, + maximize = NULL, + save_period = NULL, + save_name = "xgboost.model", + xgb_model = NULL, + callbacks = list(), + nthread = 1L, + base_score = 1, + ... +) +} +\arguments{ +\item{params}{the list of parameters. + The complete list of parameters is available at \url{http://xgboost.readthedocs.io/en/latest/parameter.html}. + Below is a shorter summary: + +1. General Parameters + +\itemize{ + \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree}. +} + +2. Booster Parameters + +2.1. Parameter for Tree Booster + +\itemize{ + \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3 + \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. + \item \code{max_depth} maximum depth of a tree. Default: 6 + \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 + \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nrounds}. Default: 1 + \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 + \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1 + \item \code{monotone_constraints} A numerical vector consists of \code{1}, \code{0} and \code{-1} with its length equals to the number of features in the training data. \code{1} is increasing, \code{-1} is decreasing and \code{0} is no constraint. + \item \code{interaction_constraints} A list of vectors specifying feature indices of permitted interactions. Each item of the list represents one permitted interaction where specified features are allowed to interact with each other. Feature index values should start from \code{0} (\code{0} references the first column). Leave argument unspecified for no interaction constraints. +} + +2.2. Parameter for Linear Booster + +\itemize{ + \item \code{lambda} L2 regularization term on weights. Default: 0 + \item \code{lambda_bias} L2 regularization term on bias. Default: 0 + \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0 +} + +3. Task Parameters + +\itemize{ +\item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: + \itemize{ + \item \code{reg:squarederror} Regression with squared loss (Default). + \item \code{reg:logistic} logistic regression. + \item \code{binary:logistic} logistic regression for binary classification. Output probability. + \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. + \item \code{num_class} set the number of classes. To use only with multiclass objectives. + \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class - 1}. + \item \code{multi:softprob} same as softmax, but prediction outputs a vector of ndata * nclass elements, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. + \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. + } + \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 + \item \code{eval_metric} evaluation metrics for validation data. Users can pass a self-defined function to it. Default: metric will be assigned according to objective(rmse for regression, and error for classification, mean average precision for ranking). List is provided in detail section. +}} + +\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input. +\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.} + +\item{nrounds}{max number of boosting iterations.} + +\item{watchlist}{named list of xgb.DMatrix datasets to use for evaluating model performance. +Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each +of these datasets during each boosting iteration, and stored in the end as a field named +\code{evaluation_log} in the resulting object. When either \code{verbose>=1} or +\code{\link[xgboost]{cb.print.evaluation}} callback is engaged, the performance results are continuously +printed out during the training. +E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track +the performance of each round's model on mat1 and mat2.} + +\item{obj}{customized objective function. Returns gradient and second order +gradient with given prediction and dtrain.} + +\item{feval}{customized evaluation function. Returns +\code{list(metric='metric-name', value='metric-value')} with given +prediction and dtrain.} + +\item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance. +If 2, some additional information will be printed out. +Note that setting \code{verbose > 0} automatically engages the +\code{cb.print.evaluation(period=1)} callback function.} + +\item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}. +Default is 1 which means all messages are printed. This parameter is passed to the +\code{\link[xgboost]{cb.print.evaluation}} callback.} + +\item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +doesn't improve for \code{k} rounds. +Setting this parameter engages the \code{\link[xgboost]{cb.early.stop}} callback.} + +\item{maximize}{If \code{feval} and \code{early_stopping_rounds} are set, +then this parameter must be set as well. +When it is \code{TRUE}, it means the larger the evaluation score the better. +This parameter is passed to the \code{\link[xgboost]{cb.early.stop}} callback.} + +\item{save_period}{when it is non-NULL, model is saved to disk after every \code{save_period} rounds, +0 means save at the end. The saving is handled by the \code{\link[xgboost]{cb.save.model}} callback.} + +\item{save_name}{the name or path for periodically saved model file.} + +\item{xgb_model}{a previously built model to continue the training from. +Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a +file with a previously saved model.} + +\item{callbacks}{a list of callback functions to perform various task during boosting. +See \code{\link[xgboost]{callbacks}}. Some of the callbacks are automatically created depending on the +parameters' values. User can provide either existing or their own callback methods in order +to customize the training process.} + +\item{...}{other parameters to pass to \code{params}.} +} +\description{ +PAMM wrapper for xgb.train +} diff --git a/man/xgb.tune_pam.Rd b/man/xgb.tune_pam.Rd new file mode 100644 index 0000000..bc67bc5 --- /dev/null +++ b/man/xgb.tune_pam.Rd @@ -0,0 +1,53 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost-fit.R +\name{xgb.tune_pam} +\alias{xgb.tune_pam} +\title{Tune xgb pam} +\usage{ +xgb.tune_pam( + data, + param_df, + nrounds, + cv_indices, + nfold = 4, + ped_params = list(), + nthread = 1L, + early_stopping_rounds = NULL, + verbose = FALSE, + print_every_n = 1L, + ... +) +} +\arguments{ +\item{data}{training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input. +\code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file.} + +\item{param_df}{A data frame of parameter combinations to tune. One +row contains one parameter set that will be passed on to +\code{params} in \code{xgb.cv_pam}.} + +\item{nrounds}{max number of boosting iterations.} + +\item{nfold}{Number of cross-valdation folds.} + +\item{ped_params}{List of parameters used to transform data into PED format.} + +\item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance +doesn't improve for \code{k} rounds. +Setting this parameter engages the \code{\link[xgboost]{cb.early.stop}} callback.} + +\item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance. +If 2, some additional information will be printed out. +Note that setting \code{verbose > 0} automatically engages the +\code{cb.print.evaluation(period=1)} callback function.} + +\item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}. +Default is 1 which means all messages are printed. This parameter is passed to the +\code{\link[xgboost]{cb.print.evaluation}} callback.} + +\item{...}{other parameters to pass to \code{params}.} +} +\description{ +Tune xgb pam +} diff --git a/man/xgboost.ped.Rd b/man/xgboost.ped.Rd new file mode 100644 index 0000000..799435b --- /dev/null +++ b/man/xgboost.ped.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost-fit.R +\name{xgboost.ped} +\alias{xgboost.ped} +\title{Fit xgboost model to piece-wise exponential data} +\usage{ +xgboost.ped(data = NULL, ...) +} +\arguments{ +\item{data}{An objet of class \code{ped}.} + +\item{...}{Further arguments passed to \code{\link[xgboost]{xgboost}}.} +} +\description{ +Transforms a data set of class \code{ped} to a format suitable for +\code{xgboost}, then runs the xgboost model. All parameters available in +\code{\link[xgboost]{xgboost}} can be passed on via ellipsis. +The \code{base_score} is set to \code{1}. +} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..2f7e735 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(pamm.xgb) + +test_check("pamm.xgb") diff --git a/tests/testthat/test-prediction-xgboost.R b/tests/testthat/test-prediction-xgboost.R new file mode 100644 index 0000000..f7d8d5b --- /dev/null +++ b/tests/testthat/test-prediction-xgboost.R @@ -0,0 +1,21 @@ +context("Test prediction functions for xgboost objects") + +test_that("Generic predict function works", { + library(pammtools) + data("tumor", package = "pammtools") + ped <- pammtools::as_ped(tumor[1:100,], Surv(days, status) ~ . ) + mod <- xgboost.ped(ped, nrounds = 100, print_every_n = 50) + # S3 extension for stats::predict + pred <- predict(mod, newdata = tumor[101:103,]) + expect_true(all(pred > 0)) + expect_equal(round(pred * 100, 2), c(0.69, 0.03, 2.46)) + pred_sp <- predict(mod, newdata = tumor[101:103, ], type = "surv_prob") + expect_true(all(pred_sp >= 0 & pred_sp <= 1)) + expect_equal(round(pred_sp, 3), c(0, 0.998, 0)) + # S3 extension for pec::predictSurvProb + sp_mat <- predictSurvProb(mod, tumor[101:103, ], times = c(1, 500)) + expect_true(all(sp_mat >= 0 & sp_mat <=1)) + expect_equal(round(sp_mat[,1], 1), rep(1, 3)) + expect_equal(round(sp_mat[,2], 1), c(.7, .9, .8)) + +}) diff --git a/tests/testthat/test-xgb-fit.R b/tests/testthat/test-xgb-fit.R new file mode 100644 index 0000000..72560c8 --- /dev/null +++ b/tests/testthat/test-xgb-fit.R @@ -0,0 +1,36 @@ +context("Test fitting xgb models for survival data") + + +test_that("xgb models are fit to PED data", { + + data("tumor", package = "pammtools") + ped <- pammtools::as_ped(tumor[1:100,], Surv(days, status) ~ . ) + + ## xgboost + mod <- xgboost.ped(ped, nrounds = 100, print_every_n = 50) + expect_class(mod, c("pam_xgb", "xgb.Booster")) + expect_class(mod[["trafo_args"]], "list") + expect_identical(length(mod[["trafo_args"]]), 3L) + expect_identical(names(mod[["trafo_args"]]), c("formula", "cut", "id")) + + ## xgb.train + mod2 <- xgb.train.ped( + params = list( + max_depth = c(3, 5), + eta = 0.3 + ), + data = as_ped(tumor[1:100,], Surv(days, status)~.), + nrounds = 500L, + watchlist = list(eval = tumor[201:300, ] + ), + verbose = FALSE, + early_stopping_rounds = 50 + ) + + expect_class(mod2, c("pam_xgb", "xgb.Booster")) + expect_class(mod2[["trafo_args"]], "list") + expect_identical(length(mod2[["trafo_args"]]), 3L) + expect_identical(names(mod2[["trafo_args"]]), c("formula", "cut", "id")) + + +})