From 03318589bc4702ee1c7e6e86ed9a45ebe8b10c0a Mon Sep 17 00:00:00 2001 From: Bart van Beusekom Date: Fri, 5 Apr 2024 17:50:44 +0200 Subject: [PATCH] Added option for node admin to create whitelist/blacklist of which columns they want to allow to be requested --- docs/v6-crosstab-py/privacy.rst | 21 +++++++++++++++++ v6-crosstab-py/partial.py | 41 +++++++++++++++++++++++++++------ 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/docs/v6-crosstab-py/privacy.rst b/docs/v6-crosstab-py/privacy.rst index 9165a61..5bc3f3e 100644 --- a/docs/v6-crosstab-py/privacy.rst +++ b/docs/v6-crosstab-py/privacy.rst @@ -28,6 +28,27 @@ records: contains only unique values, the result would reveal which unique values are present in the column. +- **Setting the allowed columns**: The node administrator can specify on which + columns they want to allow or disallow the computation of the contingency table by + adding the following to the node configuration file: + + .. code:: yaml + + algorithm_env: + CROSSTAB_ALLOWED_COLUMNS: "ageGroup,isOverweight" + CROSSTAB_DISALLOWED_COLUMNS: "age,weight" + + This configuration will ensure that only the columns `ageGroup` and `isOverweight` + are allowed to be used in the computation of the contingency table. The columns `age` + and `weight` are disallowed and will not be used in the computation. Usually, there + should be either an allowed or a disallowed list, but not both: if there is an explicit + allowed list, all other columns are automatically disallowed. + + We recommend defining these lists to ensure that the contingency table can only be + computed for categorical columns, and not for numeric ones. The latter run the risk of + revealing that certain values are present in the data. We chose to let the node + administrator handle this as they know their own data best. + - **Minimum number of data rows to participate**: A node will only participate if it contains at least `n` data rows. 
This is to prevent nodes with very little data from participating in the computation. By default, the minimum number of data rows is set diff --git a/v6-crosstab-py/partial.py b/v6-crosstab-py/partial.py index a156d4b..aacc763 100644 --- a/v6-crosstab-py/partial.py +++ b/v6-crosstab-py/partial.py @@ -45,14 +45,13 @@ def partial_crosstab( PRIVACY_THRESHOLD = _convert_envvar_to_int( "CROSSTAB_PRIVACY_THRESHOLD", DEFAULT_PRIVACY_THRESHOLD ) - MINIMUM_ROWS_TOTAL = _convert_envvar_to_int( - "CROSSTAB_MINIMUM_ROWS_TOTAL", DEFAULT_MINIMUM_ROWS_TOTAL - ) ALLOW_ZERO = _convert_envvar_to_bool("CROSSTAB_ALLOW_ZERO", DEFAULT_ALLOW_ZERO) # check if env var values are compatible info("Checking privacy settings before starting...") - _do_prestart_privacy_checks(df, PRIVACY_THRESHOLD, ALLOW_ZERO, MINIMUM_ROWS_TOTAL) + _do_prestart_privacy_checks( + df, group_cols + [results_col], PRIVACY_THRESHOLD, ALLOW_ZERO + ) # Fill empty (categorical) values with "N/A" df = df.fillna("N/A") @@ -112,7 +111,10 @@ def partial_crosstab( def _do_prestart_privacy_checks( - df: pd.DataFrame, privacy_threshold: int, allow_zero: bool, minimum_rows_total: int + df: pd.DataFrame, + requested_columns: list[str], + privacy_threshold: int, + allow_zero: bool, ) -> None: """ Perform privacy checks before starting the computation. @@ -121,13 +123,16 @@ def _do_prestart_privacy_checks( ---------- df : pd.DataFrame The dataframe containing the data. + requested_columns : list[str] + The columns requested for the computation. privacy_threshold : int The privacy threshold value. allow_zero : bool The flag indicating whether zero values are allowed. - minimum_rows_total : int - The minimum number of rows to be found in the supplied dataframe. 
""" + minimum_rows_total = _convert_envvar_to_int( + "CROSSTAB_MINIMUM_ROWS_TOTAL", DEFAULT_MINIMUM_ROWS_TOTAL + ) if privacy_threshold == 0 and not allow_zero: raise ValueError( @@ -142,6 +147,28 @@ def _do_prestart_privacy_checks( "handle this computation, as it may lead to privacy issues." ) + # Check if requested columns are allowed + allowed_columns = get_env_var("CROSSTAB_ALLOWED_COLUMNS") + if allowed_columns: + allowed_columns = allowed_columns.split(",") + for col in requested_columns: + if col not in allowed_columns: + raise ValueError( + f"The node administrator does not allow '{col}' to be requested in " + "this algorithm computation. Please contact the node administrator " + "for more information." + ) + non_allowed_collumns = get_env_var("CROSSTAB_DISALLOWED_COLUMNS") + if non_allowed_collumns: + non_allowed_collumns = non_allowed_collumns.split(",") + for col in requested_columns: + if col in non_allowed_collumns: + raise ValueError( + f"The node administrator does not allow '{col}' to be requested in " + "this algorithm computation. Please contact the node administrator " + "for more information." + ) + def _get_threshold_placeholder(privacy_threshold: int, allow_zero: bool) -> str: """