Variance Inflation Factor (#11)
* variance inflation factor and generalized variance inflation factor

* tests + docstrings

* patch bump

* VIF in docs

Co-authored-by: Alex Arslan <[email protected]>
palday and ararslan authored Feb 18, 2022
1 parent 24d0171 commit a95cc85
Showing 9 changed files with 225 additions and 1 deletion.
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
@@ -21,6 +21,11 @@ jobs:
        arch: [x64]
        os: [ubuntu-18.04] # macos-10.15, windows-2019
    steps:
      - name: Cancel Previous Runs
        uses: styfle/[email protected]
        with:
          all_but_latest: true
          access_token: ${{ github.token }}
      - name: Checkout
        uses: actions/checkout@v2
      - name: Julia Setup
5 changes: 5 additions & 0 deletions .github/workflows/documenter.yml
@@ -11,6 +11,11 @@ jobs:
    name: Documentation
    runs-on: ubuntu-latest
    steps:
      - name: Cancel Previous Runs
        uses: styfle/[email protected]
        with:
          all_but_latest: true
          access_token: ${{ github.token }}
      - uses: actions/checkout@v2
      - uses: julia-actions/julia-buildpkg@latest
      - uses: julia-actions/julia-docdeploy@latest
5 changes: 4 additions & 1 deletion Project.toml
@@ -1,16 +1,19 @@
name = "MixedModelsExtras"
uuid = "781a26e1-49f4-409a-8f4c-c3159d78c17e"
authors = ["Phillip Alday <[email protected]> and contributors"]
version = "0.1.2"
version = "0.1.3"

[deps]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"

[compat]
MixedModels = "3, 4"
StatsBase = "0.33"
StatsModels = "0.6.28"
julia = "1.6"

[extras]
14 changes: 14 additions & 0 deletions docs/src/api.md
@@ -24,3 +24,17 @@ adjr2
```@docs
icc
```

## Variance Inflation Factor

```@docs
vif
```

```@docs
termnames
```

```@docs
gvif
```
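
A usage sketch (not part of this commit) of the API documented above. The packages loaded and the data frame, column names, and values are invented purely for illustration; any `RegressionModel` with an intercept and at least two non-intercept terms should work.

```julia
using DataFrames, GLM, MixedModelsExtras

# entirely synthetic data, just to have something to fit
tbl = DataFrame(y=randn(100), x1=randn(100), x2=randn(100),
                g=rand(["a", "b", "c"], 100))

m = lm(@formula(y ~ 1 + x1 + x2 + g), tbl)

termnames(m)        # ["(Intercept)", "x1", "x2", "g"]
vif(m)              # one inflation factor per non-intercept coefficient
gvif(m)             # one inflation factor per non-intercept term
gvif(m; scale=true) # GVIF^(1/(2df)), comparable across terms with different df
```

For a categorical predictor such as `g`, `vif` reports one value per contrast-coded coefficient, while `gvif` collapses those coefficients into a single value for the term.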
4 changes: 4 additions & 0 deletions src/MixedModelsExtras.jl
@@ -2,12 +2,16 @@ module MixedModelsExtras

export icc
export r², r2, adjr², adjr2
export termnames, gvif, vif

using LinearAlgebra
using MixedModels
using Statistics
using StatsBase
using StatsModels

include("icc.jl")
include("r2.jl")
include("vif.jl")

end
124 changes: 124 additions & 0 deletions src/vif.jl
@@ -0,0 +1,124 @@
_rename_intercept(s) = strip(s) == "1" ? "(Intercept)" : s

"""
    termnames(model)

Return the names associated with (fixed effects) terms in a model.

For models with only continuous predictors, this is the same as
`coefnames(model)`.

For models with categorical predictors, the returned names reflect
the categorical predictor and not the coefficients resulting from
the choice of contrast coding.
"""
termnames(model) = _rename_intercept.(string.(_terms(model)))

_terms(model) = collect(formula(model).rhs.terms)
_terms(model::MixedModel) = collect(formula(model).rhs[1].terms)

"""
    vif(m::RegressionModel)

Compute the variance inflation factor (VIF).

Returns a vector of inflation factors computed for each coefficient except
for the intercept.
In other words, the corresponding coefficients are `coefnames(m)[2:end]`.

The [variance inflation factor (VIF)](https://en.wikipedia.org/wiki/Variance_inflation_factor) measures
the increase in the variance of a parameter's estimate in a model with multiple parameters relative to
the variance of a parameter's estimate in a model containing only that parameter.

See also [`coefnames`](@ref), [`gvif`](@ref).

!!! warning
    This method will fail if there is (numerically) perfect multicollinearity,
    i.e. rank deficiency (in the fixed effects). In that case though, the VIF
    isn't particularly informative anyway.
"""
function vif(m::RegressionModel)
    mm = Statistics.cov2cor!(vcov(m), stderror(m))
    all(==(1), view(modelmatrix(m), :, 1)) ||
        throw(ArgumentError("VIF only defined for models with an intercept term"))
    mm = @view mm[2:end, 2:end]
    size(mm, 2) > 1 ||
        throw(ArgumentError("VIF not meaningful for models with only one non-intercept term"))
    # NB: The correlation matrix is positive definite and hence invertible
    #     unless there is perfect rank deficiency, hence the warning.
    # NB: The determinant technique used for the GVIF could also be applied
    #     columnwise (instead of term-wise) here, but it wouldn't really be any
    #     more efficient, because this is a Symmetric matrix and computing all
    #     those determinants has its own cost. The determinants also hint at how
    #     you could show the equivalence, if you remember that inversion is
    #     solving a linear system and that Cramer's rule -- which uses
    #     determinants -- can also solve a linear system.
    # So we want diag(inv(mm)). Directly computing inverses is generally best
    # avoided, but these are typically small-ish matrices and this is simple.
    return diag(inv(mm))
end

"""
    gvif(m::RegressionModel; scale=false)

Compute the generalized variance inflation factor (GVIF).

If `scale=true`, then each GVIF is scaled by the degrees of freedom
for (i.e. the number of coefficients associated with) the predictor: ``GVIF^{1/(2df)}``.

Returns a vector of inflation factors computed for each term except
for the intercept.
In other words, the corresponding terms are `termnames(m)[2:end]`.

The [generalized variance inflation factor (GVIF)](https://doi.org/10.2307/2290467)
measures the increase in the variance of a (group of) parameter's estimate in a model
with multiple parameters relative to the variance of a parameter's estimate in a
model containing only that parameter. For continuous, numerical predictors, the GVIF
is the same as the VIF, but for categorical predictors, the GVIF provides a single
number for the entire group of contrast-coded coefficients associated with a categorical
predictor.

See also [`termnames`](@ref), [`vif`](@ref).

!!! warning
    This method will fail if there is (numerically) perfect multicollinearity,
    i.e. rank deficiency (in the fixed effects). In that case though, the GVIF
    isn't particularly informative anyway.

## References

Fox, J., & Monette, G. (1992). Generalized Collinearity Diagnostics.
Journal of the American Statistical Association, 87(417), 178. doi:10.2307/2290467
"""
function gvif(m::RegressionModel; scale=false)
    mm = StatsBase.cov2cor!(vcov(m), stderror(m))

    StatsModels.hasintercept(formula(m)) ||
        throw(ArgumentError("GVIF only defined for models with an intercept term"))
    mm = @view mm[2:end, 2:end]
    size(mm, 2) > 1 ||
        throw(ArgumentError("GVIF not meaningful for models with only one non-intercept term"))

    tn = @view termnames(m)[2:end]
    trms = @view _terms(m)[2:end]

    df = width.(trms)
    vals = zeros(eltype(mm), length(tn))
    # benchmarking shows that adding in Symmetric() here before det()
    # actually slows things down a bit
    detmm = det(mm)
    acc = 0
    for idx in axes(vals, 1)
        wt = df[idx]
        trm_only = [acc < i <= (acc + wt) for i in axes(mm, 2)]
        trm_excl = .!trm_only
        vals[idx] = det(view(mm, trm_only, trm_only)) *
                    det(view(mm, trm_excl, trm_excl)) /
                    detmm
        acc += wt
    end

    if scale
        vals .= vals .^ (1 ./ (2 .* df))
    end
    return vals
end
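
As an aside (again, not committed code): the comment in `vif` about the determinant technique and Cramer's rule can be checked numerically. This self-contained sketch uses a made-up correlation matrix and shows that, for 1-df terms, `diag(inv(R))` and the determinant ratio used in `gvif` agree:

```julia
using LinearAlgebra

# a made-up correlation matrix for three non-intercept predictors
R = [1.0 0.6 0.3
     0.6 1.0 0.5
     0.3 0.5 1.0]

vifs = diag(inv(R))  # the approach used in `vif`

# the determinant ratio used in `gvif`, applied one column (1-df term) at a time
detR = det(R)
gvifs = map(1:size(R, 2)) do j
    keep = trues(size(R, 2))
    keep[j] = false
    det(R[[j], [j]]) * det(R[keep, keep]) / detR
end

vifs ≈ gvifs  # true: both methods give the same inflation factors for 1-df terms
```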
4 changes: 4 additions & 0 deletions test/Project.toml
@@ -1,5 +1,9 @@
[deps]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316"
RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
4 changes: 4 additions & 0 deletions test/runtests.jl
@@ -8,3 +8,7 @@ end
@testset "r2" begin
    include("r2.jl")
end

@testset "VIF" begin
    include("vif.jl")
end
61 changes: 61 additions & 0 deletions test/vif.jl
@@ -0,0 +1,61 @@
using GLM
using LinearAlgebra
using MixedModels
using MixedModelsExtras
using StatsBase
using Test

using MixedModels: dataset
using RDatasets: dataset as rdataset

progress = false

@testset "LMM" begin
    fm0 = fit(MixedModel, @formula(reaction ~ 0 + days + (1 | subj)), dataset(:sleepstudy);
              progress)
    fm1 = fit(MixedModel, @formula(reaction ~ 1 + days + (1 | subj)), dataset(:sleepstudy);
              progress)
    fm2 = fit(MixedModel, @formula(reaction ~ 1 + days * days^2 + (1 | subj)),
              dataset(:sleepstudy); progress)

    ae_int = ArgumentError("VIF only defined for models with an intercept term")
    ae_nterm = ArgumentError("VIF not meaningful for models with only one non-intercept term")
    @test_throws ae_int vif(fm0)
    @test_throws ae_nterm vif(fm1)

    mm = @view cor(modelmatrix(fm2))[2:end, 2:end]
    # this is a slightly different way to do the same
    # computation. I've verified that doing it this way
    # in R gives the same answers as car::vif
    # note that computing the same model from scratch in R
    # gives different VIFs ([70.41934, 439.10096, 184.00107]),
    # but that model is a slightly different fit than the Julia
    # one and that has knock-on effects
    @test isapprox(vif(fm2), diag(inv(mm)))

    # since these are all continuous preds, should be the same
    # but uses a very different computational method!
    @test isapprox(vif(fm2), gvif(fm2))
    # the scale factor is gvif^(1/(2df)),
    # so just sqrt.(vif) when everything is continuous
    @test isapprox(gvif(fm2; scale=true),
                   sqrt.(gvif(fm2)))
end

@testset "GVIF and RegressionModel" begin
    duncan = rdataset("car", "Duncan")

    lm1 = lm(@formula(Prestige ~ 1 + Income + Education), duncan)
    @test termnames(lm1) == coefnames(lm1)
    vif_lm1 = vif(lm1)

    # values here taken from car
    @test isapprox(vif_lm1, [2.1049, 2.1049]; atol=1e-5)
    @test isapprox(vif_lm1, gvif(lm1))

    lm2 = lm(@formula(Prestige ~ 1 + Income + Education + Type), duncan)
    @test termnames(lm2) == ["(Intercept)", "Income", "Education", "Type"]
    @test isapprox(gvif(lm2), [2.209178, 5.297584, 5.098592]; atol=1e-5)
    @test isapprox(gvif(lm2; scale=true),
                   [1.486330, 2.301648, 1.502666]; atol=1e-5)
end
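
One more illustrative check (not part of the test file): the `scale=true` values above follow directly from the unscaled GVIFs. `Type` in the Duncan data has three levels and therefore two contrast-coded coefficients, so its scaled GVIF is the fourth root of the unscaled one.

```julia
gvif_type = 5.098592           # unscaled GVIF for `Type`, from the test above
df_type = 2                    # 3 levels, hence 2 contrast-coded coefficients
gvif_type^(1 / (2 * df_type))  # ≈ 1.502666, the scaled value tested above
```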

2 comments on commit a95cc85

@palday (Owner, Author) commented on a95cc85 on Feb 18, 2022

@JuliaRegistrator commented

Registration pull request created: JuliaRegistries/General/54943

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.1.3 -m "<description of version>" a95cc85feb7cd2e8e50e590609472cf770c1bb5c
git push origin v0.1.3
