Skip to content

Commit

Permalink
Added option for node admin to create whitelist/blacklist of which co…
Browse files Browse the repository at this point in the history
…lumns they want to allow to be requested
  • Loading branch information
bartvanb committed Apr 5, 2024
1 parent 0f7bb7f commit 0331858
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 7 deletions.
21 changes: 21 additions & 0 deletions docs/v6-crosstab-py/privacy.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,27 @@ records:
contains only unique values, the result would reveal which unique values are present
in the column.

- **Setting the allowed columns**: The node administrator can specify the
columns for which they want to allow or disallow the computation of the contingency
table by adding the following to the node configuration file:

.. code:: yaml

    algorithm_env:
      CROSSTAB_ALLOWED_COLUMNS: ["ageGroup", "isOverweight"]
      CROSSTAB_DISALLOWED_COLUMNS: ["age", "weight"]

This configuration will ensure that only the columns `ageGroup` and `isOverweight`
are allowed to be used in the computation of the contingency table. The columns `age`
and `weight` are disallowed and will not be used in the computation. Usually, there
should be either an allowed list or a disallowed list, but not both: if there is an
explicit allowed list, all other columns are automatically disallowed.

We recommend defining these lists to ensure that the contingency table can only be
computed for categorical columns, and not for numeric ones. The latter run the risk of
revealing that certain values are present in the data. We chose to let the node
administrator handle this, as they know their own data best.

- **Minimum number of data rows to participate**: A node will only participate if it
contains at least `n` data rows. This is to prevent nodes with very little data from
participating in the computation. By default, the minimum number of data rows is set
Expand Down
41 changes: 34 additions & 7 deletions v6-crosstab-py/partial.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,13 @@ def partial_crosstab(
PRIVACY_THRESHOLD = _convert_envvar_to_int(
"CROSSTAB_PRIVACY_THRESHOLD", DEFAULT_PRIVACY_THRESHOLD
)
MINIMUM_ROWS_TOTAL = _convert_envvar_to_int(
"CROSSTAB_MINIMUM_ROWS_TOTAL", DEFAULT_MINIMUM_ROWS_TOTAL
)
ALLOW_ZERO = _convert_envvar_to_bool("CROSSTAB_ALLOW_ZERO", DEFAULT_ALLOW_ZERO)

# check if env var values are compatible
info("Checking privacy settings before starting...")
_do_prestart_privacy_checks(df, PRIVACY_THRESHOLD, ALLOW_ZERO, MINIMUM_ROWS_TOTAL)
_do_prestart_privacy_checks(
df, group_cols + [results_col], PRIVACY_THRESHOLD, ALLOW_ZERO
)

# Fill empty (categorical) values with "N/A"
df = df.fillna("N/A")
Expand Down Expand Up @@ -112,7 +111,10 @@ def partial_crosstab(


def _do_prestart_privacy_checks(
df: pd.DataFrame, privacy_threshold: int, allow_zero: bool, minimum_rows_total: int
df: pd.DataFrame,
requested_columns: list[str],
privacy_threshold: int,
allow_zero: bool,
) -> None:
"""
Perform privacy checks before starting the computation.
Expand All @@ -121,13 +123,16 @@ def _do_prestart_privacy_checks(
----------
df : pd.DataFrame
The dataframe containing the data.
requested_columns : list[str]
The columns requested for the computation.
privacy_threshold : int
The privacy threshold value.
allow_zero : bool
The flag indicating whether zero values are allowed.
minimum_rows_total : int
The minimum number of rows to be found in the supplied dataframe.
"""
minimum_rows_total = _convert_envvar_to_int(
"CROSSTAB_MINIMUM_ROWS_TOTAL", DEFAULT_MINIMUM_ROWS_TOTAL
)

if privacy_threshold == 0 and not allow_zero:
raise ValueError(
Expand All @@ -142,6 +147,28 @@ def _do_prestart_privacy_checks(
"handle this computation, as it may lead to privacy issues."
)

# Check if requested columns are allowed
allowed_columns = get_env_var("CROSSTAB_ALLOWED_COLUMNS")
if allowed_columns:
allowed_columns = allowed_columns.split(",")
for col in requested_columns:
if col not in allowed_columns:
raise ValueError(
f"The node administrator does not allow '{col}' to be requested in "
"this algorithm computation. Please contact the node administrator "
"for more information."
)
non_allowed_collumns = get_env_var("CROSSTAB_DISALLOWED_COLUMNS")
if non_allowed_collumns:
non_allowed_collumns = non_allowed_collumns.split(",")
for col in requested_columns:
if col in non_allowed_collumns:
raise ValueError(
f"The node administrator does not allow '{col}' to be requested in "
"this algorithm computation. Please contact the node administrator "
"for more information."
)


def _get_threshold_placeholder(privacy_threshold: int, allow_zero: bool) -> str:
"""
Expand Down

0 comments on commit 0331858

Please sign in to comment.