Skip to content

Commit

Permalink
1. norm and lnorm distributions, issue #13
Browse files Browse the repository at this point in the history
2. rmc, stepwise, lmCombine and lmDynamic now use alm().
  • Loading branch information
Ivan Svetunkov committed Aug 8, 2018
1 parent 375c9bc commit 54f2174
Show file tree
Hide file tree
Showing 17 changed files with 246 additions and 163 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: greybox
Type: Package
Title: Toolbox for Model Building and Forecasting
Version: 0.3.1.41003
Date: 2018-08-07
Version: 0.3.1.41004
Date: 2018-08-08
Authors@R: person("Ivan", "Svetunkov", email = "[email protected]", role = c("aut", "cre"),
comment="Lecturer at Centre for Marketing Analytics and Forecasting, Lancaster University, UK")
URL: https://github.com/config-i1/greybox
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ importFrom(stats,cor)
importFrom(stats,cov)
importFrom(stats,dchisq)
importFrom(stats,deltat)
importFrom(stats,dlnorm)
importFrom(stats,dnorm)
importFrom(stats,end)
importFrom(stats,fitted)
Expand All @@ -114,10 +115,10 @@ importFrom(stats,nobs)
importFrom(stats,optim)
importFrom(stats,optimize)
importFrom(stats,pchisq)
importFrom(stats,pf)
importFrom(stats,pnorm)
importFrom(stats,predict)
importFrom(stats,qchisq)
importFrom(stats,qlnorm)
importFrom(stats,qnorm)
importFrom(stats,qt)
importFrom(stats,residuals)
Expand Down
5 changes: 4 additions & 1 deletion NEWS
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
greybox v0.3.1 (Release data: 2018-08-07)
greybox v0.3.1 (Release data: 2018-08-08)
==============

Changes:
* Corrected some typos in README.md and added description of several functions.
* predict() and forecast() functions now produce confidence and prediction intervals for the provided holdout sample data. forecast() is just a wrapper around predict().
* Normal and log-normal distributions are now available in alm().
* rmc() now uses alm().
* stepwise(), lmCombine() and lmDynamic() can now also be constructed with distributions from alm().

Bugfixes:
* Fixed a bug with the style="line" in rmc(), where the grouping would be wrong in cases, when one method significantly differs from the others.
Expand Down
134 changes: 76 additions & 58 deletions R/alm.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,18 @@
#' This is a function, similar to \link[stats]{lm}, but for the cases of several
#' non-normal distributions. These include:
#' \enumerate{
#' \item Normal distribution, \link[stats]{dnorm},
#' \item Folded normal distribution, \link[greybox]{dfnorm},
#' \item Log normal distribution, \link[stats]{dlnorm},
#' \item Laplace distribution, \link[greybox]{dlaplace},
#' \item S-distribution, \link[greybox]{ds},
#' \item Folded-normal distribution, \link[greybox]{dfnorm},
#' \item Chi-Squared Distribution, \link[stats]{dchisq}.
#' }
#'
#' This function is slower than \code{lm}, because it relies on likelihood estimation
#' of parameters, hessian calculation and matrix multiplication. So think twice when
#' using \code{distribution="norm"} here.
#'
#' Probably some other distributions will be added to this function at some point...
#'
#' The estimation is done using likelihood of respective distributions.
Expand All @@ -31,7 +37,25 @@
#' @param distribution What density function to use in the process.
#'
#' @return Function returns \code{model} - the final model of the class
#' "alm".
#' "alm", which contains:
#' \itemize{
#' \item coefficients - estimated parameters of the model,
#' \item vcov - covariance matrix of parameters of the model (based on Fisher Information),
#' \item actuals - actual values of the response variable,
#' \item fitted.values - fitted values,
#' \item residuals - residuals of the model,
#' \item mu - the estimated location parameter of the distribution,
#' \item scale - the estimated scale parameter of the distribution,
#' \item distribution - distribution used in the estimation,
#' \item logLik - log-likelihood of the model,
#' \item df.residual - number of degrees of freedom of the residuals of the model,
#' \item df - number of degrees of freedom of the model,
#' \item call - how the model was called,
#' \item rank - rank of the model,
#' \item model - data on which the model was fitted,
#' \item qr - QR decomposition of the data,
#' \item terms - terms of the model.
#' }
#'
#' @seealso \code{\link[greybox]{stepwise}, \link[greybox]{lmCombine}}
#'
Expand All @@ -49,10 +73,10 @@
#'
#' @importFrom numDeriv hessian
#' @importFrom nloptr nloptr
#' @importFrom stats model.frame sd terms dchisq
#' @importFrom stats model.frame sd terms dchisq dlnorm dnorm
#' @export alm
alm <- function(formula, data, subset=NULL, na.action,
distribution=c("laplace","s","fnorm","chisq")){
distribution=c("norm","fnorm","lnorm","laplace","s","chisq")){

cl <- match.call();

Expand Down Expand Up @@ -142,51 +166,41 @@ alm <- function(formula, data, subset=NULL, na.action,

y <- as.matrix(dataWork[,1]);

fitter <- function(A, distribution, y, matrixXreg){
# if(distribution=="fnorm"){
# scale <- A[length(A)];
# A <- A[-length(A)];
# }

yFitted <- matrixXreg %*% A;
errors <- y - yFitted;

if(distribution=="laplace"){
scale <- mean(abs(y-yFitted));
}
else if(distribution=="s"){
scale <- mean(sqrt(abs(y-yFitted))) / 2;
}
else if(distribution=="fnorm"){
scale <- sqrt(mean((y-yFitted)^2));
ifelseFast <- function(condition, yes, no){
if(condition){
return(yes);
}
else if(distribution=="chisq"){
scale <- 2*yFitted;
else{
return(no);
}
}

fitter <- function(A, distribution, y, matrixXreg){
mu <- matrixXreg %*% A;

scale <- switch(distribution,
"norm"=,
"fnorm" = sqrt(mean((y-mu)^2)),
"lnorm"= sqrt(mean((log(y)-mu)^2)),
"laplace" = mean(abs(y-mu)),
"s" = mean(sqrt(abs(y-mu))) / 2,
"chisq" = 2*mu
);

return(list(yFitted=yFitted,scale=scale,errors=errors));
return(list(mu=mu,scale=scale));
}

CF <- function(A, distribution, y, matrixXreg){
fitterReturn <- fitter(A, distribution, y, matrixXreg);

if(distribution=="laplace"){
CFReturn <- dlaplace(y, mu=fitterReturn$yFitted, b=fitterReturn$scale, log=TRUE);
}
else if(distribution=="s"){
CFReturn <- ds(y, mu=fitterReturn$yFitted, b=fitterReturn$scale, log=TRUE);
}
else if(distribution=="fnorm"){
CFReturn <- dfnorm(y, mu=fitterReturn$yFitted, sigma=fitterReturn$scale, log=TRUE);
}
else if(distribution=="chisq"){
if(any(fitterReturn$yFitted<=0)){
CFReturn <- -1E+300;
}
else{
CFReturn <- dchisq(y, fitterReturn$yFitted, log=TRUE);
}
}
CFReturn <- switch(distribution,
"norm" = dnorm(y, mean=fitterReturn$mu, sd=fitterReturn$scale, log=TRUE),
"fnorm" = dfnorm(y, mu=fitterReturn$mu, sigma=fitterReturn$scale, log=TRUE),
"lnorm" = dlnorm(y, meanlog=fitterReturn$mu, sdlog=fitterReturn$scale, log=TRUE),
"laplace" = dlaplace(y, mu=fitterReturn$mu, b=fitterReturn$scale, log=TRUE),
"s" = ds(y, mu=fitterReturn$mu, b=fitterReturn$scale, log=TRUE),
"chisq" = ifelseFast(any(fitterReturn$mu<=0),-1E+300,dchisq(y, df=fitterReturn$mu, log=TRUE))
);

CFReturn <- -sum(CFReturn[is.finite(CFReturn)]);

Expand All @@ -197,12 +211,14 @@ alm <- function(formula, data, subset=NULL, na.action,
return(CFReturn);
}

A <- as.vector(solve(t(matrixXreg) %*% matrixXreg, tol=1e-10) %*% t(matrixXreg) %*% y);

# if(distribution=="fnorm"){
# A <- c(A,sd(y));
# }
if(distribution=="lnorm"){
A <- as.vector(solve(t(matrixXreg) %*% matrixXreg, tol=1e-10) %*% t(matrixXreg) %*% log(y));
}
else{
A <- as.vector(solve(t(matrixXreg) %*% matrixXreg, tol=1e-10) %*% t(matrixXreg) %*% y);
}

# Although this is not needed in case of distribution="norm", we do that in a way, for the code consistency purposes
res <- nloptr(A, CF,
opts=list("algorithm"="NLOPT_LN_SBPLX", xtol_rel=1e-8, maxeval=500),
distribution=distribution, y=y, matrixXreg=matrixXreg);
Expand All @@ -215,8 +231,7 @@ alm <- function(formula, data, subset=NULL, na.action,
CFValue <- res$objective;

fitterReturn <- fitter(A, distribution, y, matrixXreg);
yFitted <- fitterReturn$yFitted;
errors <- fitterReturn$errors;
mu <- fitterReturn$mu;
scale <- fitterReturn$scale;

# Parameters of the model + scale
Expand All @@ -228,7 +243,7 @@ alm <- function(formula, data, subset=NULL, na.action,
warning(paste0("Something went wrong and we failed to produce the covariance matrix of the parameters.\n",
"Obviously, it's not our fault. Probably Russians have hacked your computer...\n",
"Try a different distribution maybe?"), call.=FALSE);
vcovMatrix <- diag(nVariables+(distribution=="fnorm"));
vcovMatrix <- diag(nVariables);
}
else{
if(any(vcovMatrix==0)){
Expand All @@ -248,20 +263,22 @@ alm <- function(formula, data, subset=NULL, na.action,
}

if(distribution=="fnorm"){
# A <- A[-(nVariables+1)];
# Correction so that the the expectation of the folded is returned
mu <- yFitted;
# Calculate the conditional expectation based on the parameters of the distribution
yFitted <- sqrt(2/pi)*scale*exp(-yFitted^2/(2*scale^2))+yFitted*(1-2*pnorm(-yFitted/scale));
errors <- y - yFitted;
yFitted <- sqrt(2/pi)*scale*exp(-mu^2/(2*scale^2))+mu*(1-2*pnorm(-mu/scale));
vcovMatrix <- vcovMatrix[1:nVariables,1:nVariables];
}
else if(distribution=="lnorm"){
yFitted <- exp(mu);
}
else{
mu <- yFitted;
yFitted <- mu;
if(distribution=="chisq"){
scale <- yFitted * 2;
scale <- mu * 2;
}
}
errors <- y - yFitted;

names(A) <- variablesNames;
if(nVariables>1){
dimnames(vcovMatrix) <- list(variablesNames,variablesNames);
Expand All @@ -270,8 +287,9 @@ alm <- function(formula, data, subset=NULL, na.action,
names(vcovMatrix) <- variablesNames;
}

finalModel <- list(coefficients=A, vcov=vcovMatrix, residuals=as.vector(errors), fitted.values=yFitted, actuals=y, mu=mu,
df.residual=obsInsample-df, df=df, call=cl, rank=df, model=dataWork, scale=scale,
qr=qr(dataWork), terms=ourTerms, logLik=-CFValue, distribution=distribution);
finalModel <- list(coefficients=A, vcov=vcovMatrix, actuals=y, fitted.values=yFitted, residuals=as.vector(errors),
mu=mu, scale=scale, distribution=distribution, logLik=-CFValue,
df.residual=obsInsample-df, df=df, call=cl, rank=df, model=dataWork,
qr=qr(dataWork), terms=ourTerms);
return(structure(finalModel,class=c("alm","greybox")));
}
26 changes: 22 additions & 4 deletions R/lmCombine.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#' Function combines parameters of linear regressions of the first variable
#' on all the other provided data.
#'
#' The algorithm uses lm() to fit different models and then combines the models
#' The algorithm uses alm() to fit different models and then combines the models
#' based on the selected IC.
#'
#' @template AICRef
Expand All @@ -18,6 +18,7 @@
#' one are produced and then combined.
#' @param silent If \code{FALSE}, then nothing is silent, everything is printed
#' out. \code{TRUE} means that nothing is produced.
#' @param distribution Distribution to pass to \code{alm()}.
#'
#' @return Function returns \code{model} - the final model of the class
#' "lm.combined".
Expand Down Expand Up @@ -53,7 +54,8 @@
#'
#' @aliases combine combiner
#' @export lmCombine
lmCombine <- function(data, ic=c("AICc","AIC","BIC","BICc"), bruteForce=FALSE, silent=TRUE){
lmCombine <- function(data, ic=c("AICc","AIC","BIC","BICc"), bruteForce=FALSE, silent=TRUE,
distribution=c("norm","fnorm","lnorm","laplace","s","chisq")){
# Function combines linear regression models and produces the combined lm object.
ourData <- data;
if(!is.data.frame(ourData)){
Expand All @@ -79,6 +81,16 @@ lmCombine <- function(data, ic=c("AICc","AIC","BIC","BICc"), bruteForce=FALSE, s
IC <- BICc;
}

distribution <- distribution[1];
if(distribution=="norm"){
lmCall <- lm;
listToCall <- list(NULL);
}
else{
lmCall <- alm;
listToCall <- list(distribution=distribution);
}

# Observations in sample, assuming that the missing values are for the holdout
obsInsample <- sum(!is.na(ourData[,1]));
# Number of variables
Expand Down Expand Up @@ -110,7 +122,10 @@ lmCombine <- function(data, ic=c("AICc","AIC","BIC","BICc"), bruteForce=FALSE, s
parametersSE <- matrix(0,nCombinations,nVariables+1);

# Starting estimating the models with just a constant
ourModel <- lm(as.formula(paste0(responseName,"~1")),data=ourData);
# ourModel <- alm(as.formula(paste0(responseName,"~1")),data=ourData,distribution=distribution);
listToCall$formula <- as.formula(paste0(responseName,"~1"));
listToCall$data <- ourData;
ourModel <- do.call(lmCall,listToCall);
ICs[1] <- IC(ourModel);
parameters[1,1] <- coef(ourModel)[1];
parametersSE[1,1] <- diag(vcov(ourModel));
Expand Down Expand Up @@ -193,7 +208,10 @@ lmCombine <- function(data, ic=c("AICc","AIC","BIC","BICc"), bruteForce=FALSE, s
cat(paste0(round(i/nCombinations,2)*100,"%"));
}
lmFormula <- paste0(responseName,"~",paste0(variablesNames[variablesCombinations[i,]==1],collapse="+"));
ourModel <- lm(as.formula(lmFormula),data=ourData);
# ourModel <- alm(as.formula(lmFormula),data=ourData,distribution=distribution);
listToCall$formula <- as.formula(lmFormula);
listToCall$data <- ourData;
ourModel <- do.call(lmCall,listToCall);
ICs[i] <- IC(ourModel);
parameters[i,c(1,variablesCombinations[i,])==1] <- coef(ourModel);
parametersSE[i,c(1,variablesCombinations[i,])==1] <- diag(vcov(ourModel));
Expand Down
Loading

0 comments on commit 54f2174

Please sign in to comment.