Skip to content

Commit

Permalink
Merge pull request #187 from MindSetLib/feat-monit
Browse files Browse the repository at this point in the history
Add homogeneity tests for feature monitoring and unittests for this code
  • Loading branch information
alexmindset authored Dec 3, 2022
2 parents 94d9a6f + 5f62c11 commit 1148fab
Show file tree
Hide file tree
Showing 9 changed files with 690,798 additions and 0 deletions.
15 changes: 15 additions & 0 deletions insolver/feature_monitoring/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from .homogeneity_tests import ContinuousHomogeneityTests
from .homogeneity_tests import DiscreteHomogeneityTests
from .homogeneity_tests import fillna_cont
from .homogeneity_tests import fillna_discr

from .psi_homogeneity_test import psi_discr_2samp
from .psi_homogeneity_test import psi_cont_2samp
from .psi_homogeneity_test import sec_min

from .chi2_homogeneity_test import chi2_discr_2samp

from .homogeneity_report import chart_cont
from .homogeneity_report import chart_discr
from .homogeneity_report import HomogeneityReport
from .homogeneity_report import render_report
62 changes: 62 additions & 0 deletions insolver/feature_monitoring/chi2_homogeneity_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import numpy as np
from scipy import stats as sps
from collections import defaultdict


class Chi2Result:
    """
    Result of a chi-square test, returned in scipy style
    (a structure with two named fields that can also be unpacked as a tuple).

    Parameters:
        statistic (float): value of the computed chi-square statistic.
        pvalue (float): p-value corresponding to this statistic.

    Example:
        res = Chi2Result(1.5, 0.25)
        res.statistic, res.pvalue   # attribute access
        statistic, pvalue = res     # tuple-style unpacking, as with scipy results
    """

    def __init__(self, statistic: float, pvalue: float):
        self.statistic = statistic
        self.pvalue = pvalue

    def __iter__(self):
        # Allows `statistic, pvalue = result`, mirroring scipy's result objects.
        yield self.statistic
        yield self.pvalue

    def __repr__(self) -> str:
        return f"{type(self).__name__}(statistic={self.statistic!r}, pvalue={self.pvalue!r})"


def chi2_discr_2samp(x1: np.ndarray, x2: np.ndarray) -> "Chi2Result":
    """
    Run a chi-square test checking homogeneity of two samples
    of a discrete variable.

    The statistic is computed as

        n1 * n2 * sum_i (mu_i / n1 - nu_i / n2) ** 2 / (mu_i + nu_i)

    where mu_i and nu_i are the counts of category i in x1 and x2 and
    n1, n2 are the sample sizes. The p-value is the upper tail of a chi-square
    distribution with (number of categories - 1) degrees of freedom.

    Parameters:
        x1 (np.array): sample from base period.
        x2 (np.array): sample from current period.
    Returns:
        res (Chi2Result): object containing the computed statistic and the
            corresponding pvalue. If both samples consist of one and the same
            constant value, the statistic is 0.0 and the pvalue is 1.0.
    Raises:
        ValueError: if either sample is empty (relative frequencies are undefined).
    """

    base = np.asarray(x1).ravel()
    current = np.asarray(x2).ravel()
    n1, n2 = base.size, current.size

    if n1 == 0 or n2 == 0:
        raise ValueError("Both samples must contain at least one observation.")

    # Encode both samples against the joint set of categories. Counting by
    # integer code (instead of dictionary lookups keyed by array scalars)
    # guarantees every observation lands in exactly one category bin.
    cats, codes = np.unique(np.concatenate((base, current)), return_inverse=True)
    num_cats = len(cats)

    # if both samples consist of only one constant
    # value we consider statistic to be zero
    if num_cats == 1:
        return Chi2Result(0.0, 1.0)

    codes = codes.ravel()
    counts1 = np.bincount(codes[:n1], minlength=num_cats)
    counts2 = np.bincount(codes[n1:], minlength=num_cats)

    # Every category comes from the union of both samples, so the
    # denominator counts1 + counts2 is strictly positive.
    freq_diff = counts1 / n1 - counts2 / n2
    chi2 = float(n1 * n2 * np.sum(freq_diff**2 / (counts1 + counts2)))

    # Use the survival function rather than 1 - cdf: for large statistics the
    # cdf rounds to 1.0 and the subtraction would collapse the p-value to 0.
    pvalue = float(sps.chi2.sf(chi2, num_cats - 1))
    return Chi2Result(chi2, pvalue)
Loading

0 comments on commit 1148fab

Please sign in to comment.