Inverse Gaussian distribution in alm(), #13

config-i1 · Dec 10, 2019 · 88dd74b · 88dd74b
1 parent 658fea7
commit 88dd74b
Show file tree

Hide file tree

Showing 5 changed files with 73 additions and 30 deletions.
diff --git a/NEWS b/NEWS
@@ -1,10 +1,10 @@
-greybox v0.5.7 (Release data: 2019-12-04)
+greybox v0.5.7 (Release data: 2019-12-10)
 ==============
 
 Changes:
 * plot.greybox() now also produces LOWESS lines on the scatterplots. There is a parameters that regulates this.
 * A bit of tuning of plot.greybox() graphs.
-* Inverse Gaussian distribution for alm() is now available. Not all the methods work okay yet. Predictions might be off.
+* Inverse Gaussian distribution for alm() is now available and works okay. Note that this is a non-conventional model. Check vignette for alm() function.
 * Legend for tableplot() is now by default FALSE.
 
 Bugfixes:

diff --git a/R/alm.R b/R/alm.R
@@ -422,8 +422,6 @@ alm <- function(formula, data, subset, na.action,
                                                     lambda=fitterReturn$other, log=TRUE),
                                 "dinvgauss" = dinvgauss(y[otU], mean=fitterReturn$mu[otU],
                                                         dispersion=fitterReturn$scale/fitterReturn$mu[otU], log=TRUE),
-                                # "dinvgauss" = dinvgauss(y[otU]/fitterReturn$mu[otU], mean=1, dispersion=fitterReturn$scale, log=TRUE),
-                                # "dinvgauss" = dinvgauss(y[otU], mean=fitterReturn$mu[otU], dispersion=fitterReturn$scale, log=TRUE),
                                 "dlaplace" = dlaplace(y[otU], mu=fitterReturn$mu[otU], scale=fitterReturn$scale, log=TRUE),
                                 "dalaplace" = dalaplace(y[otU], mu=fitterReturn$mu[otU], scale=fitterReturn$scale,
                                                         alpha=fitterReturn$other, log=TRUE),
@@ -447,7 +445,7 @@ alm <- function(formula, data, subset, na.action,
                                             "dfnorm" =,
                                             "dbcnorm" =,
                                             "dlnorm" = obsZero*(log(sqrt(2*pi)*fitterReturn$scale)+0.5),
-                                            "dinvgauss" = obsZero*0.5*(log(pi)+1-log(2/fitterReturn$scale)),
+                                            "dinvgauss" = obsZero*0.5*(log(pi)+1-suppressWarnings(log(2/fitterReturn$scale))),
                                             "dlaplace" =,
                                             "dalaplace" = obsZero*(1 + log(2*fitterReturn$scale)),
                                             "dlogis" = obsZero*2,
@@ -693,7 +691,7 @@ alm <- function(formula, data, subset, na.action,
              call.=FALSE);
     }
 
-    if(any(y==0) & any(distribution==c("dinvgauss"))){
+    if(any(y==0) & any(distribution==c("dinvgauss")) & !occurrenceModel){
         stop(paste0("Zero values are not allowed in the response variable for the distribution '",distribution,"'"),
              call.=FALSE);
     }

diff --git a/R/methods.R b/R/methods.R
@@ -692,7 +692,20 @@ predict.alm <- function(object, newdata=NULL, interval=c("none", "confidence", "
         greyboxForecast$scale <- sigma;
     }
     else if(object$distribution=="dinvgauss"){
-        greyboxForecast$scale <- greyboxForecast$variance / greyboxForecast$mean^3;
+        # This is a multiplicative model, so the variance is different
+        # if(interval=="p"){
+        #     residualsVariance <- sigma(object)^2;
+        #     greyboxForecast$variance[] <- greyboxForecast$variance - residualsVariance;
+        #     greyboxForecast$variance[] <- (greyboxForecast$variance * (residualsVariance+1) +
+        #                                        residualsVariance * greyboxForecast$mean^2);
+        # }
+        # greyboxForecast$scale <- greyboxForecast$variance / greyboxForecast$mean^3;
+        if(interval=="p"){
+            greyboxForecast$scale <- object$scale;
+        }
+        else if(interval=="c"){
+            greyboxForecast$scale <- greyboxForecast$variance / greyboxForecast$mean^3;
+        }
         if(interval!="n"){
             greyboxForecast$lower[] <- greyboxForecast$mean*qinvgauss(levelLow,mean=1,
                                                                       dispersion=greyboxForecast$scale);
@@ -1617,10 +1630,9 @@ plot.greybox <- function(x, which=c(1,2,4,6), level=0.95, legend=FALSE,
         }
 
         if(is.alm(x$occurrence)){
-            ellipsis$x <- ellipsis$x[ellipsis$y!=0];
-            ellipsis$y <- ellipsis$y[ellipsis$y!=0];
+            ellipsis$x <- ellipsis$x[actuals(x$occurrence)!=0];
+            ellipsis$y <- ellipsis$y[actuals(x$occurrence)!=0];
         }
-        ellipsis$y[] <- ellipsis$y;
 
         if(!any(names(ellipsis)=="main")){
             ellipsis$main <- paste0(yName," Residuals vs Fitted");
@@ -1648,11 +1660,12 @@ plot.greybox <- function(x, which=c(1,2,4,6), level=0.95, legend=FALSE,
                           "dlogis"=qlogis(c((1-level)/2, (1+level)/2), 0, 1),
                           "dt"=qt(c((1-level)/2, (1+level)/2), nobs(x)-nparam(x)),
                           "ds"=qs(c((1-level)/2, (1+level)/2), 0, 1),
-                          # The next one is not correct...
-                          "dinvgauss"=qinvgauss(c((1-level)/2, (1+level)/2), mean=1, dispersion=x$scale),
+                          # In the next one, the scale is debiased, taking n-k into account
+                          "dinvgauss"=qinvgauss(c((1-level)/2, (1+level)/2), mean=1,
+                                                dispersion=x$scale * nobs(x) / (nobs(x)-nparam(x))),
                           qnorm(c((1-level)/2, (1+level)/2), 0, 1));
-        outliers <- which(ellipsis$y >zValues[2] | ellipsis$y < zValues[1]);
-        cat(paste0(round(length(outliers)/length(ellipsis$y),3)*100,"% of values are outside the bounds\n"));
+        outliers <- which(ellipsis$y >zValues[2] | ellipsis$y <zValues[1]);
+        # cat(paste0(round(length(outliers)/length(ellipsis$y),3)*100,"% of values are outside the bounds\n"));
 
         if(!any(names(ellipsis)=="ylim")){
             ellipsis$ylim <- range(c(ellipsis$y,zValues));
@@ -1747,7 +1760,7 @@ plot.greybox <- function(x, which=c(1,2,4,6), level=0.95, legend=FALSE,
 
         ellipsis$y <- residuals(x);
         if(is.alm(x$occurrence)){
-            ellipsis$y <- ellipsis$y[ellipsis$y!=0];
+            ellipsis$y <- ellipsis$y[actuals(x$occurrence)!=0];
         }
 
         if(!any(names(ellipsis)=="xlab")){
@@ -2280,17 +2293,24 @@ rstandard.greybox <- function(model, ...){
     obs <- nobs(model);
     df <- obs - nparam(model);
     errors <- residuals(model);
+    # If this is an occurrence model, then only modify the non-zero obs
+    if(is.alm(model$occurrence)){
+        residsToGo <- actuals(model$occurrence)!=0;
+    }
+    else{
+        residsToGo <- c(1:obs);
+    }
     if(any(model$distribution==c("dt","dnorm","dlnorm","dbcnorm","dnbinom","dpois"))){
-        return((errors - mean(errors)) / sqrt(sum(residuals(model)^2) / df));
+        return((errors - mean(errors[residsToGo])) / sqrt(sum(residuals(model)^2) / df));
     }
     else if(model$distribution=="ds"){
-            return((errors - mean(errors)) / (model$scale * obs / df)^2);
+            return((errors - mean(errors[residsToGo])) / (model$scale * obs / df)^2);
     }
     else if(model$distribution=="dinvgauss"){
-            return(errors / mean(errors));
+            return(errors / mean(errors[residsToGo]));
     }
     else{
-        return((errors - mean(errors)) / model$scale * obs / df);
+        return((errors - mean(errors[residsToGo])) / model$scale * obs / df);
     }
 }
 
@@ -2301,39 +2321,46 @@ rstudent.greybox <- function(model, ...){
     df <- obs - nparam(model) - 1;
     rstudentised <- errors <- residuals(model);
     errors[] <- errors - mean(errors);
+    # If this is an occurrence model, then only modify the non-zero obs
+    if(is.alm(model$occurrence)){
+        residsToGo <- actuals(model$occurrence)!=0;
+    }
+    else{
+        residsToGo <- c(1:obs);
+    }
     if(any(model$distribution==c("dt","dnorm","dlnorm","dbcnorm"))){
-        for(i in 1:obs){
+        for(i in residsToGo){
             rstudentised[i] <- errors[i] / sqrt(sum(errors[-i]^2) / df)
         }
     }
     else if(model$distribution=="ds"){
-        for(i in 1:obs){
+        for(i in residsToGo){
             rstudentised[i] <- errors[i] / (sum(sqrt(abs(errors[-i]))) / (2*df))^2;
         }
     }
     else if(model$distribution=="dlaplace"){
-        for(i in 1:obs){
+        for(i in residsToGo){
             rstudentised[i] <- errors[i] / (sum(abs(errors[-i])) / df);
         }
     }
     else if(model$distribution=="dalaplace"){
-        for(i in 1:obs){
+        for(i in residsToGo){
             rstudentised[i] <- errors[i] / (sum(errors[-i] * (model$other$alpha - (errors[-i]<=0)*1)) / df);
         }
     }
     else if(model$distribution=="dlogis"){
-        for(i in 1:obs){
+        for(i in residsToGo){
             rstudentised[i] <- errors[i] / (sqrt(sum(errors[-i]^2) / df) * sqrt(3) / pi);
         }
     }
     else if(model$distribution=="dinvgauss"){
         errors[] <- residuals(model);
-        for(i in 1:obs){
-            rstudentised[i] <- errors[i] / mean(errors[-i]);
+        for(i in residsToGo){
+            rstudentised[i] <- errors[i] / mean(errors[residsToGo][-i]);
         }
     }
     else{
-        for(i in 1:obs){
+        for(i in residsToGo){
             rstudentised[i] <- errors[i] / sqrt(sum(errors[-i]^2) / df)
         }
     }

diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@ The package _greybox_ contains functions for model building, which is currently
 There are several groups of functions in the package.
 
 ### Regression model functions
-1. alm - advanced linear regression model that implements likelihood estimation of parameters for Normal, Laplace, Asymmetric Laplace, Logistic, Student's t, S, Folded Normal, Log Normal, Chi-Squared, Beta, Poisson, Negative Binomial, Cumulative Logistic and Cumulative Normal distributions. In a sense this is similar to `glm()` function, but with a different set of distributions and with a focus on forecasting.
+1. alm - advanced linear regression model that implements likelihood estimation of parameters for Normal, Laplace, Asymmetric Laplace, Logistic, Student's t, S, Folded Normal, Log Normal, Box-Cox Normal, Inverse Gaussian, Chi-Squared, Beta, Poisson, Negative Binomial, Cumulative Logistic and Cumulative Normal distributions. In a sense this is similar to `glm()` function, but with a different set of distributions and with a focus on forecasting.
 2. stepwise - function implements stepwise IC based on partial correlations.
 3. lmCombine - function combines the regression models from the provided data, based on IC weights and returns the combined alm object.
 

diff --git a/vignettes/alm.Rmd b/vignettes/alm.Rmd
@@ -240,8 +240,9 @@ This group includes:
 
 1. [Log Normal distribution](#lnormal),
 2. [Box-Cox Normal distribution](#bcnormal),
-3. [Folded Normal distribution](#fnormal),
-4. [Noncentral Chi Squared distribution](#chisq).
+3. [Inverse Gaussian distribution](#invgauss),
+4. [Folded Normal distribution](#fnormal),
+5. [Noncentral Chi Squared distribution](#chisq).
 
 Although (2) and (3) in theory allow having zeroes in data, given that the density function is equal to zero in any specific point, it will be zero in these cases as well. So the `alm()` will return some solutions for these distributions, but don't expect anything good. As for (1), it supports strictly positive data.
 
@@ -285,6 +286,23 @@ where $\epsilon_t \sim \mathcal{N}(0, \sigma^2)$ or:
 
 `alm()` with `distribution="dbcnorm"` does not transform the provided data and estimates the density directly using `dbcnorm()` function from `greybox` with the estimated mean $\mu_t$ and the variance \eqref{eq:sigmaBCNormal}. The $\mu_t$ is returned in the variable `mu`, the $\sigma^2$ is in the variable `scale`, while the `fitted.values` contains the exponent of $\mu_t$, which, given the connection between the Normal and Box-Cox Normal distributions, corresponds to median of distribution rather than mean. Finally, `resid()` method returns $e_t = \frac{y_t^{\lambda}-1}{\lambda} - \mu_t$.
 
+### Inverse Gaussian distribution {#invgauss}
+Inverse Gaussian distribution is an interesting distribution, which is defined for positive values only and has some properties similar to the properties of the Normal distribution. It has two parameters: location $\mu_t$ and scale $\phi$ (aka "dispersion"). There are different ways to parameterise this distribution, we use the dispersion-based one. The important thing that distincts the implementation in `alm()` from the one in `glm()` or in any other function is that we assume that the model has the following form:
+\begin{equation} \label{eq:InverseGaussianModel}
+    y_t = \mu_t \times \epsilon_t 
+\end{equation}
+and that $\epsilon_t \sim \mathcal{IG}(1, \phi)$. This means that $y_t \sim \mathcal{IG}\left(\mu_t, \frac{\phi}{\mu_t} \right)$, implying that the dispersion of the model changes together with the expectation. The density function for the error term in this case is:
+\begin{equation} \label{eq:InverseGaussian}
+    f(\epsilon_t) = \frac{1}{\sqrt{2 \pi \phi \epsilon_t^3}} \exp \left( -\frac{\left(\epsilon_t - 1 \right)^2}{2 \phi \epsilon_t} \right) ,
+\end{equation}
+where the dispersion parameter is estimated via maximising the likelihood and is calculated using:
+\begin{equation} \label{eq:InverseGaussianDispersion}
+    \hat{\phi} = \frac{1}{T} \sum_{t=1}^T \frac{\left(\epsilon_t - 1 \right)^2}{\epsilon_t} .
+\end{equation}
+One of the main issues in this formulation is that we are dealing with a mixed model, where $\mu_t$ is an additive term and $\epsilon_t$ is the multiplicative one. In many cases this should be okay, but in theory the model might fail in the holdout sample, when unobserved values of explanatory variables are used. In addition, we assume that $\mu_t$ is just a scale for the distribution, otherwise $y_t$ would not follow the Inverse Gaussian distribution.
+
+`alm()` with `distribution="dinvgauss"` estimates the density for $y_t$ using `dinvgauss()` function from `statmod` package. The $\mu_t$ is returned in the variables `mu` and `fitted.values`, the dispersion $\phi$ is in the variable `scale`. `resid()` method returns $e_t = \frac{y_t}{\mu_t}$. Finally, the prediction and confidence intervals for the regression model are generated using `qinvgauss()` function from the `statmod` package.
+
 ### Folded Normal distribution {#fnormal}
 Folded Normal distribution is obtained when the absolute value of normally distributed variable is taken: if $x \sim \mathcal{N}(\mu, \sigma^2)$, then $|x| \sim \text{folded }\mathcal{N}(\mu, \sigma^2)$. The density function is:
 \begin{equation} \label{eq:foldedNormal}