From a8d92722481bc85364ca1e90c3dc5470777727df Mon Sep 17 00:00:00 2001 From: Max Baak Date: Mon, 10 Jan 2022 22:52:54 +0100 Subject: [PATCH 1/3] Initial version of RollingInputFixedReference New pipeline for: - rolling input histograms - and fixed (external) histograms --- popmon/analysis/comparison/hist_comparer.py | 87 +++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index 67a5b224..0b82924e 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -360,6 +360,93 @@ def transform(self, datastore): return super().transform(datastore) +class RollingInputFixedReference(Pipeline): + """Base pipeline to compare rolling input histograms to fixed (external) histograms""" + + def __init__( + self, + read_key, + reference_key, + store_key, + assign_to_key=None, + window=1, + shift=1, + hist_col="histogram", + suffix1="roll", + suffix2="ref", + prefix="rollref", + max_res_bound=7.0, + ): + """Initialize an instance of RollingInputFixedReference. + + :param str read_key: key of input data to read from data store + :param str reference_key: key of input data to read from data store + :param str store_key: key of output data to store in data store + :param str assign_to_key: key of the input data to assign function applied-output to. (optional) + :param int window: size of rolling window, default is 1. + :param int shift: shift of rolling window. default is 1. + :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram'. + :param str suffix1: column/key of rolling histogram. default is 'roll' -> column = 'histogram_roll' + :param str suffix2: column/key of external histogram. default is 'ref' -> column = 'histogram_ref' + :param str prefix: prefix of comparisons metrics. default is 'rollref' + :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. + Default is 7.0. + """ + if assign_to_key is None: + assign_to_key = read_key + + # make rolling reference histograms + hist_collector1 = ApplyFunc( + apply_to_key=read_key, + assign_to_key=assign_to_key, + ) + hist_collector1.add_apply_func( + func=rolling_hist, + entire=True, + suffix=suffix1, + window=window, + shift=shift, + hist_name=hist_col, + ) + + # make fixed (external) reference histograms + hist_collector2 = ApplyFunc( + apply_to_key=reference_key, + assign_to_key=assign_to_key, + ) + hist_collector2.add_apply_func( + func=hist_sum, entire=True, suffix=suffix2, metrics=[hist_col] + ) + + # do histogram comparison + hist_comparer = ApplyFunc( + apply_to_key=assign_to_key, + assign_to_key=store_key, + apply_funcs=[ + { + "func": hist_compare, + "hist_name1": hist_col + "_" + suffix1, + "hist_name2": hist_col + "_" + suffix2, + "prefix": prefix, + "axis": 1, + "max_res_bound": max_res_bound, + } + ], + ) + + super().__init__(modules=[hist_collector1, hist_collector2, hist_comparer]) + + self.read_key = read_key + self.reference_key = reference_key + self.window = window + + def transform(self, datastore): + self.logger.info( + f'Comparing "{self.read_key}" with rolling sum of {self.window} slots to {self.reference_key}.' + ) + return super().transform(datastore) + + class NormHistComparer(Pipeline): """Base pipeline to compare histogram to normalized histograms""" From 5f1599e1a979a527be8433ca90e3580927d92b03 Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Tue, 10 May 2022 13:10:45 +0200 Subject: [PATCH 2/3] refactor: remove metrics pipeline code duplication --- popmon/pipeline/metrics_pipelines.py | 445 +++++++++++---------------- 1 file changed, 182 insertions(+), 263 deletions(-) diff --git a/popmon/pipeline/metrics_pipelines.py b/popmon/pipeline/metrics_pipelines.py index 4d8328d0..c9dabfb5 100644 --- a/popmon/pipeline/metrics_pipelines.py +++ b/popmon/pipeline/metrics_pipelines.py @@ -16,7 +16,7 @@ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER # IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - +from typing import List, Union from ..alerting import ( AlertsSummary, @@ -40,7 +40,7 @@ RefMedianMadPullCalculator, RollingPullCalculator, ) -from ..base import Pipeline +from ..base import Module, Pipeline from ..hist.hist_splitter import HistSplitter @@ -94,6 +94,124 @@ def create_metrics_pipeline( return pipeline +def get_splitting_modules( + hists_key, features, time_axis +) -> List[Union[Module, Pipeline]]: + """ + Splitting of test histograms. For each histogram with datetime i, comparison of histogram i with histogram i-1, + results in chi2 comparison of histograms + """ + modules: List[Union[Module, Pipeline]] = [ + HistSplitter( + read_key=hists_key, + store_key="split_hists", + features=features, + feature_begins_with=f"{time_axis}:", + ), + PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + HistProfiler(read_key="split_hists", store_key="profiles"), + ] + return modules + + +def get_traffic_light_modules(monitoring_rules) -> List[Union[Module, Pipeline]]: + """ + Expand all (wildcard) static traffic light bounds and apply them. + Applied to both profiles and comparisons datasets + """ + modules: List[Union[Module, Pipeline]] = [ + TrafficLightAlerts( + read_key="profiles", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds", + ), + TrafficLightAlerts( + read_key="comparisons", + rules=monitoring_rules, + store_key="traffic_lights", + expanded_rules_key="static_bounds_comparisons", + ), + ApplyFunc( + apply_to_key="traffic_lights", + apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], + assign_to_key="alerts", + msg="Generating traffic light alerts summary.", + ), + AlertsSummary(read_key="alerts"), + ] + return modules + + +def get_static_bound_modules(pull_rules) -> List[Union[Module, Pipeline]]: + """ + generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, used for + plotting in popmon_profiles report. + """ + modules: List[Union[Module, Pipeline]] = [ + StaticBounds( + read_key="profiles", + rules=pull_rules, + store_key="dynamic_bounds", + suffix_mean="_mean", + suffix_std="_std", + ), + StaticBounds( + read_key="comparisons", + rules=pull_rules, + store_key="dynamic_bounds_comparisons", + suffix_mean="_mean", + suffix_std="_std", + ), + ] + return modules + + +def get_dynamic_bound_modules(pull_rules) -> List[Union[Module, Pipeline]]: + """ + Generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, used for + plotting in popmon_profiles report. + """ + modules: List[Union[Module, Pipeline]] = [ + DynamicBounds( + read_key="profiles", + rules=pull_rules, + store_key="dynamic_bounds", + suffix_mean="_mean", + suffix_std="_std", + ), + DynamicBounds( + read_key="comparisons", + rules=pull_rules, + store_key="dynamic_bounds_comparisons", + suffix_mean="_mean", + suffix_std="_std", + ), + ] + return modules + + +def get_trend_modules(window) -> List[Union[Module, Pipeline]]: + """Looking for significant rolling linear trends in selected features/metrics""" + modules: List[Union[Module, Pipeline]] = [ + ApplyFunc( + apply_to_key="profiles", + assign_to_key="comparisons", + apply_funcs=[ + { + "func": rolling_lr_zscore, + "suffix": f"_trend{window}_zscore", + "entire": True, + "window": window, + "metrics": ["mean", "phik", "fraction_true"], + } + ], + msg="Computing significance of (rolling) trend in means of features", + ), + ] + return modules + + class SelfReferenceMetricsPipeline(Pipeline): def __init__( self, @@ -118,17 +236,8 @@ def __init__( """ from popmon.analysis.comparison.comparisons import Comparisons - modules = [ - # 1. splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + reference_prefix = "ref" + reference_modules: List[Union[Module, Pipeline]] = [ # 3. Comparison of with profiled test histograms, results in chi2 comparison of histograms ReferenceHistComparer( reference_key="split_hists", @@ -141,11 +250,13 @@ def __init__( suffix_mean="_mean", suffix_std="_std", suffix_pull="_pull", - metrics=[f"ref_{key}" for key in Comparisons.get_comparisons().keys()], + metrics=[ + f"{reference_prefix}_{key}" + for key in Comparisons.get_comparisons().keys() + ], ), # 4. profiling of histograms, then pull calculation compared with reference mean and std, # to obtain normalized residuals of profiles - HistProfiler(read_key="split_hists", store_key="profiles"), RefMedianMadPullCalculator( reference_key="profiles", assign_to_key="profiles", @@ -153,59 +264,15 @@ def __init__( suffix_std="_std", suffix_pull="_pull", ), - # 5. looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. - StaticBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - StaticBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # 7. expand all (wildcard) static traffic light bounds and apply them. - # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), ] + + modules = ( + get_splitting_modules(hists_key, features, time_axis) + + reference_modules + + get_trend_modules(window) + + get_static_bound_modules(pull_rules) + + get_traffic_light_modules(monitoring_rules) + ) super().__init__(modules) @@ -233,17 +300,10 @@ def __init__( :param kwargs: residual keyword arguments :return: assembled external reference pipeline """ - modules = [ - # 1. splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + from popmon.analysis.comparison.comparisons import Comparisons + + reference_prefix = "ref" + reference_modules: List[Union[Module, Pipeline]] = [ # 3. Profiling of split reference histograms, then chi2 comparison with test histograms HistSplitter( read_key=ref_hists_key, @@ -256,17 +316,19 @@ def __init__( assign_to_key="split_hists", store_key="comparisons", ), + HistProfiler(read_key="split_ref_hists", store_key="ref_profiles"), RefMedianMadPullCalculator( reference_key="comparisons", assign_to_key="comparisons", suffix_mean="_mean", suffix_std="_std", suffix_pull="_pull", - metrics=["ref_max_prob_diff"], + metrics=[ + f"{reference_prefix}_{key}" + for key in Comparisons.get_comparisons().keys() + ], ), # 4. pull calculation compared with reference mean and std, to obtain normalized residuals of profiles - HistProfiler(read_key="split_hists", store_key="profiles"), - HistProfiler(read_key="split_ref_hists", store_key="ref_profiles"), ReferencePullCalculator( reference_key="ref_profiles", assign_to_key="profiles", @@ -274,59 +336,14 @@ def __init__( suffix_std="_std", suffix_pull="_pull", ), - # 5. looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. - StaticBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - StaticBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # 7. expand all (wildcard) static traffic light bounds and apply them. - # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), ] + modules = ( + get_splitting_modules(hists_key, features, time_axis) + + reference_modules + + get_trend_modules(window) + + get_static_bound_modules(pull_rules) + + get_traffic_light_modules(monitoring_rules) + ) super().__init__(modules) @@ -354,17 +371,10 @@ def __init__( :param kwargs: residual keyword arguments :return: assembled rolling reference pipeline """ - modules = [ - # 1. splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + from popmon.analysis.comparison.comparisons import Comparisons + + reference_prefix = "roll" + reference_modules: List[Union[Module, Pipeline]] = [ # 3. profiling of reference histograms, then comparison of with profiled test histograms # results in chi2 comparison of histograms RollingHistComparer( @@ -379,11 +389,13 @@ def __init__( suffix_mean="_mean", suffix_std="_std", suffix_pull="_pull", - metrics=["roll_max_prob_diff"], + metrics=[ + f"{reference_prefix}_{key}" + for key in Comparisons.get_comparisons().keys() + ], ), # 4. profiling of histograms, then pull calculation compared with reference mean and std, # to obtain normalized residuals of profiles - HistProfiler(read_key="split_hists", store_key="profiles"), RollingPullCalculator( read_key="profiles", window=window, @@ -392,59 +404,15 @@ def __init__( suffix_std="_std", suffix_pull="_pull", ), - # 5. looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. - DynamicBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - DynamicBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # 7. expand all (wildcard) static traffic light bounds and apply them. - # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), ] + + modules = ( + get_splitting_modules(hists_key, features, time_axis) + + reference_modules + + get_trend_modules(window) + + get_dynamic_bound_modules(pull_rules) + + get_traffic_light_modules(monitoring_rules) + ) super().__init__(modules) @@ -472,17 +440,10 @@ def __init__( :param kwargs: residual keyword arguments :return: assembled expanding reference pipeline """ - modules = [ - # 1. splitting of test histograms - HistSplitter( - read_key=hists_key, - store_key="split_hists", - features=features, - feature_begins_with=f"{time_axis}:", - ), - # 2. for each histogram with datetime i, comparison of histogram i with histogram i-1, results in - # chi2 comparison of histograms - PreviousHistComparer(read_key="split_hists", store_key="comparisons"), + from popmon.analysis.comparison.comparisons import Comparisons + + reference_prefix = "expanding" + reference_modules: List[Union[Module, Pipeline]] = [ # 3. profiling of reference histograms, then comparison of with profiled test histograms # results in chi2 comparison of histograms ExpandingHistComparer( @@ -496,9 +457,11 @@ def __init__( suffix_mean="_mean", suffix_std="_std", suffix_pull="_pull", - metrics=["expanding_max_prob_diff"], + metrics=[ + f"{reference_prefix}_{key}" + for key in Comparisons.get_comparisons().keys() + ], ), - HistProfiler(read_key="split_hists", store_key="profiles"), ExpandingPullCalculator( read_key="profiles", shift=shift, @@ -506,57 +469,13 @@ def __init__( suffix_std="_std", suffix_pull="_pull", ), - # 5. looking for significant rolling linear trends in selected features/metrics - ApplyFunc( - apply_to_key="profiles", - assign_to_key="comparisons", - apply_funcs=[ - { - "func": rolling_lr_zscore, - "suffix": f"_trend{window}_zscore", - "entire": True, - "window": window, - "metrics": ["mean", "phik", "fraction_true"], - } - ], - msg="Computing significance of (rolling) trend in means of features", - ), - # 6. generate dynamic traffic light boundaries, based on traffic lights for normalized residuals, - # used for plotting in popmon_profiles report. - DynamicBounds( - read_key="profiles", - rules=pull_rules, - store_key="dynamic_bounds", - suffix_mean="_mean", - suffix_std="_std", - ), - DynamicBounds( - read_key="comparisons", - rules=pull_rules, - store_key="dynamic_bounds_comparisons", - suffix_mean="_mean", - suffix_std="_std", - ), - # 7. expand all (wildcard) static traffic light bounds and apply them. - # Applied to both profiles and comparisons datasets - TrafficLightAlerts( - read_key="profiles", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds", - ), - TrafficLightAlerts( - read_key="comparisons", - rules=monitoring_rules, - store_key="traffic_lights", - expanded_rules_key="static_bounds_comparisons", - ), - ApplyFunc( - apply_to_key="traffic_lights", - apply_funcs=[{"func": traffic_light_summary, "axis": 1, "suffix": ""}], - assign_to_key="alerts", - msg="Generating traffic light alerts summary.", - ), - AlertsSummary(read_key="alerts"), ] + + modules = ( + get_splitting_modules(hists_key, features, time_axis) + + reference_modules + + get_trend_modules(window) + + get_dynamic_bound_modules(pull_rules) + + get_traffic_light_modules(monitoring_rules) + ) super().__init__(modules) From 69fd2531e05324ce4fdf70965aef3ed8475b1f3c Mon Sep 17 00:00:00 2001 From: Simon Brugman Date: Fri, 20 May 2022 13:37:41 +0200 Subject: [PATCH 3/3] refactor: expose prefix parameter on hist comparer --- popmon/analysis/comparison/hist_comparer.py | 14 +++-- .../analysis/comparison/test_hist_comparer.py | 51 ++++++++++--------- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/popmon/analysis/comparison/hist_comparer.py b/popmon/analysis/comparison/hist_comparer.py index 0b82924e..54d168e7 100644 --- a/popmon/analysis/comparison/hist_comparer.py +++ b/popmon/analysis/comparison/hist_comparer.py @@ -151,7 +151,8 @@ def __init__( store_key, assign_to_key=None, hist_col="histogram", - suffix="comp", + suffix="prev", + prefix="prev", max_res_bound=7.0, *args, **kwargs, @@ -163,7 +164,8 @@ def __init__( :param str store_key: key of output data to store in data store :param str assign_to_key: key of the input data to assign function applied-output to. (optional) :param str hist_col: column/key in input df/dict that contains the histogram. default is 'histogram' - :param str suffix: column/key of rolling histogram. default is 'roll' -> column = 'histogram_roll' + :param str suffix: column/key of rolling histogram. default is 'ref' -> column = 'histogram_ref' + :param str prefix: column/key of comparisons. default is 'comp' -> column = 'comp_pearson' :param float max_res_bound: count number of normalized residuals with (absolute) value greater than X. Default is 7.0. :param args: (tuple, optional): residual args passed on to func_mean and func_std @@ -189,7 +191,7 @@ def __init__( "func": hist_compare, "hist_name1": hist_col, "hist_name2": hist_col + "_" + suffix, - "prefix": suffix, + "prefix": prefix, "axis": 1, "max_res_bound": max_res_bound, } @@ -210,6 +212,7 @@ def __init__( shift=1, hist_col="histogram", suffix="roll", + prefix="rolling", max_res_bound=7.0, ): """Initialize an instance of RollingHistComparer. @@ -230,6 +233,7 @@ def __init__( read_key, hist_col, suffix, + prefix, max_res_bound, window=window, shift=shift, @@ -286,6 +290,7 @@ def __init__( shift=1, hist_col="histogram", suffix="expanding", + prefix="expanding", max_res_bound=7.0, ): """Initialize an instance of ExpandingHistComparer. @@ -305,6 +310,7 @@ def __init__( read_key, hist_col, suffix, + prefix, max_res_bound, shift=shift, hist_name=hist_col, @@ -328,6 +334,7 @@ def __init__( store_key, hist_col="histogram", suffix="ref", + prefix="reference", max_res_bound=7.0, ): """Initialize an instance of ReferenceHistComparer. @@ -347,6 +354,7 @@ def __init__( assign_to_key, hist_col, suffix, + prefix, max_res_bound, metrics=[hist_col], ) diff --git a/tests/popmon/analysis/comparison/test_hist_comparer.py b/tests/popmon/analysis/comparison/test_hist_comparer.py index 466b5c56..ec358387 100644 --- a/tests/popmon/analysis/comparison/test_hist_comparer.py +++ b/tests/popmon/analysis/comparison/test_hist_comparer.py @@ -68,25 +68,26 @@ def test_hist_compare(): def test_reference_hist_comparer(): - hist_list = ["date:country", "date:bankrupt", "date:num_employees", "date:A_score"] features = ["country", "bankrupt", "num_employees", "A_score"] + prefix = "my_pref" + suffix = "my_ref" cols = [ - "ref_pearson", - "ref_chi2", - "ref_chi2_zscore", - "ref_chi2_norm", - "ref_chi2_pvalue", - "ref_chi2_max_residual", - "ref_chi2_spike_count", - "ref_ks", - "ref_ks_zscore", - "ref_ks_pvalue", - "ref_max_prob_diff", - "ref_jsd", - "ref_psi", - "ref_unknown_labels", + f"{prefix}_pearson", + f"{prefix}_chi2", + f"{prefix}_chi2_zscore", + f"{prefix}_chi2_norm", + f"{prefix}_chi2_pvalue", + f"{prefix}_chi2_max_residual", + f"{prefix}_chi2_spike_count", + f"{prefix}_ks", + f"{prefix}_ks_zscore", + f"{prefix}_ks_pvalue", + f"{prefix}_max_prob_diff", + f"{prefix}_jsd", + f"{prefix}_psi", + f"{prefix}_unknown_labels", ] pipeline = Pipeline( @@ -102,41 +103,44 @@ def test_reference_hist_comparer(): reference_key="output_hist", assign_to_key="output_hist", store_key="comparison", + suffix=suffix, + prefix=prefix, ), ] ) datastore = pipeline.transform(datastore={}) + assert set(datastore.keys()) == {"example_hist", "output_hist", "comparison"} assert "comparison" in datastore and isinstance(datastore["comparison"], dict) assert len(datastore["comparison"].keys()) == len(features) for f in features: assert f in datastore["comparison"] - for f in features: assert isinstance(datastore["comparison"][f], pd.DataFrame) + assert f in datastore["output_hist"] + assert f"histogram_{suffix}" in datastore["output_hist"][f] df = datastore["comparison"]["A_score"] assert len(df) == 16 np.testing.assert_array_equal(sorted(df.columns), sorted(cols)) - np.testing.assert_almost_equal(df["ref_chi2"].mean(), 2.623206018518519) + np.testing.assert_almost_equal(df[f"{prefix}_chi2"].mean(), 2.623206018518519) df = datastore["comparison"]["country"] assert len(df) == 17 np.testing.assert_array_equal(sorted(df.columns), sorted(cols)) - np.testing.assert_almost_equal(df["ref_chi2"].mean(), 0.9804481792717087) + np.testing.assert_almost_equal(df[f"{prefix}_chi2"].mean(), 0.9804481792717087) df = datastore["comparison"]["bankrupt"] assert len(df) == 17 np.testing.assert_array_equal(sorted(df.columns), sorted(cols)) - np.testing.assert_almost_equal(df["ref_chi2"].mean(), 0.6262951496388027) + np.testing.assert_almost_equal(df[f"{prefix}_chi2"].mean(), 0.6262951496388027) df = datastore["comparison"]["num_employees"] assert len(df) == 17 np.testing.assert_array_equal(sorted(df.columns), sorted(cols)) - np.testing.assert_almost_equal(df["ref_chi2"].mean(), 4.213429217840983) + np.testing.assert_almost_equal(df[f"{prefix}_chi2"].mean(), 4.213429217840983) def test_expanding_hist_comparer(): - hist_list = ["date:country", "date:bankrupt", "date:num_employees", "date:A_score"] features = ["country", "bankrupt", "num_employees", "A_score"] @@ -164,7 +168,7 @@ def test_expanding_hist_comparer(): store_key="example_hist", ), HistSplitter( - read_key="example_hist", store_key="output_hist", features=hist_list + read_key="example_hist", store_key="output_hist", features=hist_list, prefix="expanding" ), ExpandingHistComparer(read_key="output_hist", store_key="comparison"), ] @@ -202,7 +206,6 @@ def test_expanding_hist_comparer(): @pytest.mark.filterwarnings("ignore:An input array is constant") @pytest.mark.filterwarnings("ignore:invalid value encountered in true_divide") def test_rolling_hist_comparer(): - hist_list = ["date:country", "date:bankrupt", "date:num_employees", "date:A_score"] features = ["country", "bankrupt", "num_employees", "A_score"] @@ -233,7 +236,7 @@ def test_rolling_hist_comparer(): read_key="example_hist", store_key="output_hist", features=hist_list ), RollingHistComparer( - read_key="output_hist", store_key="comparison", window=5 + read_key="output_hist", store_key="comparison", window=5, prefix="roll" ), ] )