From 16baa392ade30d5b81db6f025c9c5331c578f4f8 Mon Sep 17 00:00:00 2001 From: Jirayus Date: Mon, 25 Mar 2019 18:46:05 +1100 Subject: [PATCH] Address issue #9 - Implement a function to remove constant metrics and categorical metrics - Implement a function to get correlation groups according to VarClus --- NAMESPACE | 1 + R/AutoSpearman.R | 27 ++++++++------------------ R/check.constant.categorical.R | 32 +++++++++++++++++++++++++++++++ R/get.vc.R | 4 ++++ R/get.vc.correlation.groups.R | 30 +++++++++++++++++++++++++++++ man/check.constant.categorical.Rd | 22 +++++++++++++++++++++ man/get.vc.correlation.groups.Rd | 22 +++++++++++++++++++++ 7 files changed, 119 insertions(+), 19 deletions(-) create mode 100644 R/check.constant.categorical.R create mode 100644 R/get.vc.correlation.groups.R create mode 100644 man/check.constant.categorical.Rd create mode 100644 man/get.vc.correlation.groups.Rd diff --git a/NAMESPACE b/NAMESPACE index 77799df..e3d370b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ S3method(plot,summarizedvc) export(AutoSpearman) +export(check.constant.categorical) export(fit) export(get.importance) export(loadDefectDataset) diff --git a/R/AutoSpearman.R b/R/AutoSpearman.R index e4afac4..32ddc15 100644 --- a/R/AutoSpearman.R +++ b/R/AutoSpearman.R @@ -12,7 +12,7 @@ #' @param vif.threshold a numeric for a threshold of VIF score (default = 5) #' @param verbose TRUE for printing #' @keywords AutoSpearman -#' @examples +#' @examples #' Data = loadDefectDataset('groovy-1_5_7','jira') #' AutoSpearman(dataset = Data$data, metrics = Data$indep) #' @export @@ -22,25 +22,14 @@ AutoSpearman <- spearman.threshold = 0.7, vif.threshold = 5, verbose = F) { + # Check constant metrics and categorical metrics + metrics <- check.constant.categorical(dataset, metrics) - # Check constant metrics - constant <- apply(dataset[, metrics], 2, function(x) max(x) == min(x)) - constant <- names(constant[constant == TRUE]) - # Remove constant metrics - if(length(constant) > 0){ - metrics <- metrics[!metrics %in% constant] - } - # Check categorical metrics - category <- sapply(dataset[, metrics], class) - category <- names(category[category=="character"]) - # Remove categorical metrics from Spearman Analysis - if(length(category) > 0){ - metrics <- metrics[!metrics %in% category] - } + spearman.metrics <- + get.automated.spearman(dataset, metrics, spearman.threshold, verbose) + AutoSpearman.metrics <- + stepwise.vif(dataset, spearman.metrics, vif.threshold, verbose) - spearman.metrics <- get.automated.spearman(dataset, metrics, spearman.threshold, verbose) - AutoSpearman.metrics <- stepwise.vif(dataset, spearman.metrics, vif.threshold, verbose) - - return(c(AutoSpearman.metrics,category)) + return(AutoSpearman.metrics) } diff --git a/R/check.constant.categorical.R b/R/check.constant.categorical.R new file mode 100644 index 0000000..ddca673 --- /dev/null +++ b/R/check.constant.categorical.R @@ -0,0 +1,32 @@ +#' Check for constant metrics and categorical metrics +#' +#' @param dataset a data frame for data +#' @param metrics a characters or a vector of characters for independent variables +#' @keywords constant categorical +#' @examples +#' Data = loadDefectDataset('groovy-1_5_7','jira') +#' check.constant.categorical(dataset = Data$data, metrics = Data$indep) +#' @export +check.constant.categorical <- + function(dataset, + metrics) { + # Check constant metrics + constant <- + apply(dataset[, metrics], 2, function(x) + max(x) == min(x)) + constant <- names(constant[constant == TRUE]) + # Remove constant metrics + if (length(constant) > 0) { + metrics <- metrics[!metrics %in% constant] + } + + # Check categorical metrics + category <- sapply(dataset[, metrics], class) + category <- names(category[category == "character"]) + # Remove categorical metrics from Spearman Analysis + if (length(category) > 0) { + metrics <- metrics[!metrics %in% category] + } + + return(metrics) + } \ No newline at end of file diff --git a/R/get.vc.R b/R/get.vc.R index 772e3a0..6a59838 100644 --- a/R/get.vc.R +++ b/R/get.vc.R @@ -9,6 +9,10 @@ #' @keywords VarClus get.vc <- function(dataset, metrics, similarity = 'spearman', varclus.threshold = 0.7){ + + # Check constant metrics and categorical metrics + metrics <- check.constant.categorical(dataset, metrics) + f <- as.formula(paste("~", paste(metrics, collapse = " + "))) vc <- Hmisc::varclus(f, diff --git a/R/get.vc.correlation.groups.R b/R/get.vc.correlation.groups.R new file mode 100644 index 0000000..14dc7fe --- /dev/null +++ b/R/get.vc.correlation.groups.R @@ -0,0 +1,30 @@ +#' Get correlation groups according to VarClus based on the absolute Spearman correlation coefficients between metrics +#' +#' This function makes life simple by providing a VarClus. +#' @param dataset a data frame for data +#' @param metrics a vector of characters or a vector of characters for independent variables +#' @param similarity a character for similarity measures (e.g., Spearman rank correlation), default = spearman +#' @param varclus.threshold a numeric for correlation coefficient threshold value +#' @importFrom Hmisc varclus +#' @keywords VarClus + +get.vc.correlation.groups <- function(dataset, metrics, similarity = 'spearman', varclus.threshold = 0.7){ + + # Check constant metrics and categorical metrics + metrics <- check.constant.categorical(dataset, metrics) + + f <- as.formula(paste("~", paste(metrics, collapse = " + "))) + vc <- + Hmisc::varclus(f, + similarity = similarity, + data = dataset[, metrics], + trans = "abs") + + var.clusters <- + cutree(vc$hclust, h = (1 - varclus.threshold)) + melted.data <- melt(var.clusters) + varclus.correlation.groups <- data.frame(metrics = row.names(melted.data), rank = var.clusters) + row.names(varclus.correlation.groups) <- NULL + + return(varclus.correlation.groups) +} \ No newline at end of file diff --git a/man/check.constant.categorical.Rd b/man/check.constant.categorical.Rd new file mode 100644 index 0000000..b4c45cf --- /dev/null +++ b/man/check.constant.categorical.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check.constant.categorical.R +\name{check.constant.categorical} +\alias{check.constant.categorical} +\title{Check for constant metrics and categorical metrics} +\usage{ +check.constant.categorical(dataset, metrics) +} +\arguments{ +\item{dataset}{a data frame for data} + +\item{metrics}{a characters or a vector of characters for independent variables} +} +\description{ +Check for constant metrics and categorical metrics +} +\examples{ +Data = loadDefectDataset('groovy-1_5_7','jira') +check.constant.categorical(dataset = Data$data, metrics = Data$indep) +} +\keyword{categorical} +\keyword{constant} diff --git a/man/get.vc.correlation.groups.Rd b/man/get.vc.correlation.groups.Rd new file mode 100644 index 0000000..c729938 --- /dev/null +++ b/man/get.vc.correlation.groups.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get.vc.correlation.groups.R +\name{get.vc.correlation.groups} +\alias{get.vc.correlation.groups} +\title{Get correlation groups according to VarClus based on the absolute Spearman correlation coefficients between metrics} +\usage{ +get.vc.correlation.groups(dataset, metrics, similarity = "spearman", + varclus.threshold = 0.7) +} +\arguments{ +\item{dataset}{a data frame for data} + +\item{metrics}{a vector of characters or a vector of characters for independent variables} + +\item{similarity}{a character for similarity measures (e.g., Spearman rank correlation), default = spearman} + +\item{varclus.threshold}{a numeric for correlation coefficient threshold value} +} +\description{ +This function makes life simple by providing a VarClus. +} +\keyword{VarClus}