Skip to content

Commit aef768d

Browse files
committed
updates in MAD method, logger implemented, cleaning code, updating examples
1 parent a23ae64 commit aef768d

6 files changed

Lines changed: 203 additions & 148 deletions

File tree

benfordslaw/__init__.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,21 @@
11
from benfordslaw.benfordslaw import benfordslaw
22
from benfordslaw.benfordslaw import compute_excess_mad
3-
from benfordslaw.benfordslaw import EXCESS_MAD_CONSTANTS
3+
import logging
44

55
__author__ = 'Erdogan Tasksen'
66
__email__ = 'erdogant@gmail.com'
7-
__version__ = '2.0.2'
7+
__version__ = '2.1.0'
8+
9+
# Setup the package-level 'benfordslaw' logger (not the root logger)
10+
_logger = logging.getLogger('benfordslaw')
11+
_log_handler = logging.StreamHandler()
12+
_fmt = '[{asctime}] [{name}] [{levelname}] {message}'
13+
_formatter = logging.Formatter(fmt=_fmt, style='{', datefmt='%d-%m-%Y %H:%M:%S')
14+
_log_handler.setFormatter(_formatter)
15+
_log_handler.setLevel(logging.DEBUG)
16+
_logger.addHandler(_log_handler)
17+
_logger.propagate = False
18+
819

920
# module level doc-string
1021
__doc__ = """
@@ -15,7 +26,7 @@
1526
The law states that in many naturally occurring collections of numbers, the leading significant digit is likely to be small.
1627
This method can be used if you want to test whether your set of numbers may be artificial (or manipulated).
1728
18-
New in version 2.0.2:
29+
New in version 2.1.0:
1930
- Added Excess MAD (Mean Absolute Deviation) statistic for more reliable fraud detection
2031
- Added first-two-digits test (pos='first_two') for higher resolution analysis
2132
- Added 'mad' method option for MAD-based conformity assessment
@@ -25,7 +36,7 @@
2536
>>> # Import library
2637
>>> from benfordslaw import benfordslaw
2738
>>> #
28-
>>> # Initialize with MAD method (recommended for fraud detection)
39+
>>> # Initialize with MAD method
2940
>>> bl = benfordslaw(pos='first_two', method='mad')
3041
>>> #
3142
>>> df = bl.import_example()
@@ -37,15 +48,15 @@
3748
>>> #
3849
>>> # Access Excess MAD results
3950
>>> print(f"Excess MAD: {results['excess_mad']}")
40-
>>> print(f"Conformity: {results['conformity']}")
51+
>>> print(f"Conformity: {results['conformity_mad']}")
4152
>>> #
4253
>>> # Figure
4354
>>> fig, ax = bl.plot()
4455
4556
Quick Excess MAD Computation
4657
----------------------------
4758
>>> from benfordslaw import compute_excess_mad
48-
>>> result = compute_excess_mad(data, pos='first_two')
59+
>>> result = compute_excess_mad(X, pos='first_two')
4960
>>> print(f"Excess MAD: {result['excess_mad']}")
5061
5162
References

benfordslaw/benfordslaw.py

Lines changed: 115 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -15,29 +15,22 @@
1515
from scipy.stats import combine_pvalues
1616
import matplotlib.pyplot as plt
1717
import math
18+
import logging
1819

19-
20-
# %% Constants for Excess MAD calculation
21-
# These constants are derived from the variance of the binomial distribution for each digit test.
22-
# For the first-two-digits test, C = 158.8 as derived in Barney & Schulzke (2016).
23-
# For other tests, constants are computed using the formula:
24-
# C = K^2 * π / (2 * (Σ sqrt(p_k * (1 - p_k)))^2)
25-
# where K is the number of digit categories and p_k are the Benford probabilities.
26-
EXCESS_MAD_CONSTANTS = {
27-
'first_two': 158.8, # First-two-digits test (90 categories, k=10..99)
28-
1: 21.27, # First digit test (9 categories, k=1..9)
29-
2: 30.30, # Second digit test (10 categories, k=0..9)
30-
3: 31.83, # Third digit test (10 categories, approximately uniform)
31-
}
32-
20+
logger = logging.getLogger(__name__)
3321

3422
# %% Class
3523
class benfordslaw:
3624
"""Class benfordslaw."""
3725

38-
def __init__(self, alpha=0.05, method='chi2', pos=1, verbose=3):
26+
def __init__(self, alpha: float = 0.05, method: str = 'chi2', pos: int = 1, verbose: [str, int] = 'info',):
3927
"""Initialize benfordslaw with user-defined parameters.
4028
29+
Constants for Excess MAD calculation.
30+
The constants are derived from the variance of the binomial distribution for each digit test.
31+
For the first-two-digits test, C = 158.8 as derived in Barney & Schulzke (2016).
32+
For other tests, constants are computed using the formula: C = K^2 * π / (2 * (Σ sqrt(p_k * (1 - p_k)))^2), where K is the number of digit categories and p_k are the Benford probabilities.
33+
4134
Parameters
4235
----------
4336
X : list or numpy array
@@ -58,19 +51,33 @@ def __init__(self, alpha=0.05, method='chi2', pos=1, verbose=3):
5851
* -2: second last digit (etc.)
5952
* 'first_two': First two digits combined (recommended for Benford's Law analysis,
6053
provides higher resolution with 90 categories instead of 9)
61-
verbose : int, optional
62-
Print message to screen. The default is 3.
54+
verbose : str or int, optional, default='info' (20)
55+
Logging verbosity level. Possible values:
56+
- 0, 60, None, 'silent', 'off', 'no' : no messages.
57+
- 10, 'debug' : debug level and above.
58+
- 20, 'info' : info level and above.
59+
- 30, 'warning' : warning level and above.
60+
- 50, 'critical' : critical level and above.
6361
6462
References
6563
----------
6664
* Barney, B. J., & Schulzke, K. S. (2016). Moderating "Cry Wolf" Events with Excess MAD
6765
in Benford's Law Research and Practice. Journal of Forensic Accounting Research, 1(1), A66-A90.
6866
6967
"""
68+
# Set the logger
69+
verbose = set_logger(verbose=verbose)
70+
7071
if (alpha is None): alpha = 1
7172
self.alpha = alpha
7273
self.method = method
7374
self.pos = pos
75+
self.EXCESS_MAD_CONSTANTS = {
76+
'first_two': 158.8, # First-two-digits test (90 categories, k=10..99)
77+
1: 21.27, # First digit test (9 categories, k=1..9)
78+
2: 30.30, # Second digit test (10 categories, k=0..9)
79+
3: 31.83, # Third digit test (10 categories, approximately uniform)
80+
}
7481
self.verbose = verbose
7582

7683
# Benford's Law percentage-distribution for leading digits
@@ -89,9 +96,10 @@ def __init__(self, alpha=0.05, method='chi2', pos=1, verbose=3):
8996
self.leading_digits = [10.2, 10.1, 10.1, 10.1, 10.0, 10.0, 9.9, 9.9, 9.9, 9.8]
9097
self.digit_range = range(0, 10)
9198
elif pos == 0:
92-
raise Exception('[benfordslaw] >There is no leading digit distribution for the 0 digit!')
99+
logger.error("There is no leading digit distribution for the 0 digit!")
100+
raise ValueError("There is no leading digit distribution for the 0 digit!")
93101
elif isinstance(pos, int) and (pos > 3 or pos < 0):
94-
if verbose >= 3: print(f'[benfordslaw] >The is no leading digit distribution explicitly specified for digit [{pos}] and therefore the Uniform distribution is used instead.')
102+
logger.info(f'[benfordslaw] >The is no leading digit distribution explicitly specified for digit [{pos}] and therefore the Uniform distribution is used instead.')
95103
# Approximation, near-uniform distribution
96104
self.leading_digits = [10.0] * 10
97105
self.digit_range = range(0, 10)
@@ -120,7 +128,7 @@ def fit(self, X):
120128
>>> # Import library
121129
>>> from benfordslaw import benfordslaw
122130
>>> #
123-
>>> # Initialize with MAD method (recommended for fraud detection)
131+
>>> # Initialize with MAD method
124132
>>> bl = benfordslaw(pos='first_two', method='mad')
125133
>>> #
126134
>>> # Get data for one candidate
@@ -154,7 +162,7 @@ def fit(self, X):
154162
excess_mad : float
155163
Excess MAD = MAD - E(MAD). Negative values indicate conformity,
156164
positive values indicate deviation from Benford's Law.
157-
conformity : str
165+
conformity_mad : str
158166
Conformity assessment based on MAD thresholds ('close conformity',
159167
'acceptable conformity', 'marginally acceptable conformity', or 'nonconforming').
160168
N : int
@@ -169,7 +177,8 @@ def fit(self, X):
169177
170178
"""
171179
# Make distribution first digits
172-
if self.verbose >= 3: print("[benfordslaw] >Analyzing digit position: [%s]" % (self.pos))
180+
logger.info(f"Analyzing digit position: {self.pos}")
181+
self.results = {}
173182
# Convert pandas dataframe to numpy array
174183
if isinstance(X, pd.DataFrame): X = X.values.ravel()
175184
# Count digit based on position type
@@ -206,30 +215,30 @@ def fit(self, X):
206215

207216
# Show message
208217
if self.method == 'mad':
209-
if self.verbose >= 3:
210-
if excess_mad <= 0:
211-
print("[benfordslaw] >[mad] No anomaly detected. Excess MAD=%g (%s)" % (excess_mad, conformity))
212-
else:
213-
print("[benfordslaw] >[mad] Potential anomaly. Excess MAD=%g (%s)" % (excess_mad, conformity))
214-
elif np.isnan(Praw) and (self.verbose >= 3):
215-
print(f"[benfordslaw] >No data available for this position.")
216-
elif (Praw <= self.alpha) and (self.verbose >= 3):
217-
print("[benfordslaw] >[%s] Anomaly detected! P=%g, Tstat=%g" % (self.method, Praw, tstats))
218-
elif (Praw > self.alpha) and self.verbose >= 3:
219-
print("[benfordslaw] >[%s] No anomaly detected. P=%g, Tstat=%g" % (self.method, Praw, tstats))
220-
218+
logger.info(f"[{self.method}] {'No anomaly detected' if excess_mad <= 0 else 'Potential anomaly'}. Excess MAD={excess_mad} ({conformity})")
219+
elif np.isnan(Praw):
220+
logger.info("No data available for this position.")
221+
elif (Praw <= self.alpha):
222+
logger.info(f"[{self.method}] Anomaly detected! P={Praw}, Tstat={tstats}")
223+
elif (Praw > self.alpha):
224+
logger.info(f"[{self.method}] No anomaly detected. P={Praw}, Tstat={tstats}")
225+
226+
# Set bool based on selected method
227+
if self.method == 'mad':
228+
self.results['P_significant'] = excess_mad > 0
229+
else:
230+
self.results['P_significant'] = Praw <= self.alpha
231+
221232
# Store
222-
self.results = {}
233+
self.results['N'] = int(total_count)
223234
self.results['P'] = Praw
224235
self.results['t'] = tstats
225-
self.results['P_significant'] = Praw <= self.alpha if not np.isnan(Praw) else (excess_mad > 0)
226236
self.results['percentage_emp'] = np.c_[digit, percentage_emp]
227237
# Always include MAD statistics
228238
self.results['mad'] = mad
229239
self.results['expected_mad'] = expected_mad
230240
self.results['excess_mad'] = excess_mad
231-
self.results['conformity'] = conformity
232-
self.results['N'] = int(total_count)
241+
self.results['conformity_mad'] = conformity
233242

234243
# return
235244
return self.results
@@ -339,9 +348,9 @@ def _get_excess_mad_constant(self):
339348
340349
"""
341350
if self.pos == 'first_two':
342-
return EXCESS_MAD_CONSTANTS['first_two']
343-
elif self.pos in EXCESS_MAD_CONSTANTS:
344-
return EXCESS_MAD_CONSTANTS[self.pos]
351+
return self.EXCESS_MAD_CONSTANTS['first_two']
352+
elif self.pos in self.EXCESS_MAD_CONSTANTS:
353+
return self.EXCESS_MAD_CONSTANTS[self.pos]
345354
else:
346355
# For other positions, use an approximation based on the distribution
347356
# C ≈ K² × π / 2 for approximately uniform distributions
@@ -392,7 +401,7 @@ def plot(self, title='', fontsize=16, barcolor='black', barwidth=0.3, label='Emp
392401

393402
# Build title based on method
394403
if self.method == 'mad':
395-
title = title + "\nExcess MAD=%g (%s)" % (self.results['excess_mad'], self.results['conformity'])
404+
title = title + "\nExcess MAD=%g (%s)" % (self.results['excess_mad'], self.results['conformity_mad'])
396405
elif not np.isnan(self.results['P']) and self.results['P'] <= self.alpha:
397406
title = title + "\nAnomaly detected! P=%g, Tstat=%g" % (self.results['P'], self.results['t'])
398407
elif not np.isnan(self.results['P']):
@@ -429,7 +438,7 @@ def plot(self, title='', fontsize=16, barcolor='black', barwidth=0.3, label='Emp
429438
plt.show()
430439
return fig, ax
431440

432-
def import_example(self, data='elections', url=None, sep=',', verbose=3):
441+
def import_example(self, data='elections', url=None, sep=',', verbose='info'):
433442
"""Import example dataset from github source.
434443
435444
Import one of the few datasets from github source or specify your own download url link.
@@ -622,17 +631,77 @@ def compute_excess_mad(data, pos='first_two'):
622631
in Benford's Law Research and Practice. Journal of Forensic Accounting Research, 1(1), A66-A90.
623632
624633
"""
625-
bl = benfordslaw(pos=pos, method='mad', verbose=0)
634+
bl = benfordslaw(pos=pos, method='mad', verbose='info')
626635
results = bl.fit(np.asarray(data))
627636
return {
628637
'mad': results['mad'],
629638
'expected_mad': results['expected_mad'],
630639
'excess_mad': results['excess_mad'],
631-
'conformity': results['conformity'],
640+
'conformity_mad': results['conformity_mad'],
632641
'N': results['N']
633642
}
634643

635644

645+
646+
# %%
647+
def set_logger(verbose: [str, int] = 'info', return_status: bool = False):
    """Set the verbosity level of the module logger.

    Parameters
    ----------
    verbose : [str, int], default is 'info' or 20
        Set the verbose messages using string or integer values.
        Level names are matched case-insensitively.
        * 0, 60, None, 'silent', 'off', 'no': No message.
        * 10, 'debug': Messages from debug level and higher.
        * 20, 'info': Messages from info level and higher.
        * 30, 'warning': Messages from warning level and higher.
        * 40, 'error': Messages from error level and higher.
        * 50, 'critical': Messages from critical level and higher.
    return_status : bool, default is False
        Retained for backward compatibility. The resolved level is always
        returned, because callers in this module assign the result
        (``verbose = set_logger(verbose=verbose)``) and would otherwise
        end up storing ``None``.

    Returns
    -------
    int
        The numeric logging level that was applied to the logger.

    Raises
    ------
    KeyError
        If ``verbose`` is a string that is not one of the supported level names.

    Examples
    --------
    >>> # Set the logger to warning
    >>> set_logger(verbose='warning')
    >>>
    >>> # Test with different messages
    >>> logger.debug("Hello debug")
    >>> logger.info("Hello info")
    >>> logger.warning("Hello warning")
    >>> logger.critical("Hello critical")
    >>>
    """
    # One step above CRITICAL: no standard message can reach this level.
    silent = logging.CRITICAL + 10

    if verbose is None or verbose == 0:
        # 0 and None are both treated as "no messages".
        verbose = silent
    elif isinstance(verbose, str):
        # Translate a level name into its numeric logging level.
        levels = {
            'silent': silent,
            'off': silent,
            'no': silent,
            'debug': logging.DEBUG,
            'info': logging.INFO,
            'warning': logging.WARNING,
            'error': logging.ERROR,
            'critical': logging.CRITICAL,
        }
        key = verbose.strip().lower()
        if key not in levels:
            # Same exception type as the plain dict lookup, but with guidance.
            raise KeyError(
                f"Unknown verbose level {verbose!r}; expected one of "
                f"{sorted(levels)} or an integer logging level."
            )
        verbose = levels[key]

    # Apply the resolved level to the module logger.
    logger.setLevel(verbose)
    return verbose
697+
698+
699+
# %%
700+
def get_logger():
    """Return the effective logging level of the module logger.

    Returns
    -------
    int
        The value reported by ``logger.getEffectiveLevel()``.

    """
    effective_level = logger.getEffectiveLevel()
    return effective_level
702+
703+
636704
# %% Main
637705
if __name__ == "__main__":
638-
print('[benfordslaw] >Please bootup python and run benfordslaw as described in the readme file: https://github.com/erdogant/benfordslaw')
706+
logger.info(f'Please bootup python and run benfordslaw as described in the readme file: https://github.com/erdogant/benfordslaw')
707+

benfordslaw/examples.py

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,27 @@
1+
# %% Excess MAD - sample size adjusted conformity measure
2+
# Reference: Barney & Schulzke (2016), Journal of Forensic Accounting Research
3+
from benfordslaw import benfordslaw, compute_excess_mad
4+
5+
# First-two-digits test with MAD method (recommended for fraud detection)
6+
bl = benfordslaw(pos='first_two', method='mad')
7+
df = bl.import_example(data='elections_usa')
8+
X = df['votes'].loc[df['candidate'] == 'Donald Trump'].values
9+
10+
results = bl.fit(X)
11+
print(f"MAD: {results['mad']:.6f}")
12+
print(f"Expected MAD: {results['expected_mad']:.6f}")
13+
print(f"Excess MAD: {results['excess_mad']:.6f}") # Negative = good conformity
14+
print(f"Conformity: {results['conformity_mad']}")
15+
16+
bl.plot(title='Excess MAD Analysis - Donald Trump')
17+
18+
# Quick computation using convenience function
19+
quick_result = compute_excess_mad(X, pos='first_two')
20+
print(f"Quick Excess MAD: {quick_result['excess_mad']:.6f}")
21+
22+
# %%
23+
24+
125
"""Examples for benfords law."""
226

327
# import benfordslaw
@@ -156,7 +180,7 @@
156180
bl.plot(title='Donald Trump', barcolor=[0.5, 0.5, 0.5], fontsize=12, barwidth=0.4)
157181

158182
# %% RUS
159-
df = bl.import_example('RUS')
183+
df = bl.import_example('elections_rus')
160184
candidates=['Putin Vladimir Vladimirovich', 'Baburin Sergei Nikolaevich', 'Titov Boris Yurievich', 'Yavlinskiy Gregory Alekseivich']
161185

162186
for candidate in candidates:
@@ -172,25 +196,6 @@
172196
bl.fit(X)
173197
bl.plot(title=candidate)
174198

175-
# %% Excess MAD - sample size adjusted conformity measure
176-
# Reference: Barney & Schulzke (2016), Journal of Forensic Accounting Research
177-
from benfordslaw import benfordslaw, compute_excess_mad
178-
179-
# First-two-digits test with MAD method (recommended for fraud detection)
180-
bl = benfordslaw(pos='first_two', method='mad')
181-
df = bl.import_example(data='elections_usa')
182-
X = df['votes'].loc[df['candidate'] == 'Donald Trump'].values
183-
184-
results = bl.fit(X)
185-
print(f"MAD: {results['mad']:.6f}")
186-
print(f"Expected MAD: {results['expected_mad']:.6f}")
187-
print(f"Excess MAD: {results['excess_mad']:.6f}") # Negative = good conformity
188-
print(f"Conformity: {results['conformity']}")
189-
190-
bl.plot(title='Excess MAD Analysis - Donald Trump')
191199

192-
# Quick computation using convenience function
193-
quick_result = compute_excess_mad(X, pos='first_two')
194-
print(f"Quick Excess MAD: {quick_result['excess_mad']:.6f}")
195200

196201
# %%

0 commit comments

Comments
 (0)