diff --git a/docs/conf.py b/docs/conf.py index a9da830..2cd503e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -8,7 +8,8 @@ # documentation root, use os.path.abspath to make it absolute, like shown here. # -# import os +import os + # import sys # sys.path.insert(0, os.path.abspath('../package')) @@ -29,6 +30,7 @@ "sphinx.ext.autosectionlabel", "sphinx.ext.intersphinx", "sphinx_click.ext", + "sphinxcontrib.plantuml", ] @@ -54,3 +56,9 @@ pygments_style = None numfig = False + +plantuml_output_format = "svg_img" +local_plantuml_path = os.path.join( + os.path.dirname(__file__), "static", "java", "plantuml.jar" +) +plantuml = f"java -Djava.awt.headless=true -jar {local_plantuml_path}" diff --git a/docs/index.rst b/docs/index.rst index d3ea58f..8015ee6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,17 +4,21 @@ Overview Description ----------- -.. Give short description of the algorithm on this main page of the docs +This algorithm computes a cross-table a.k.a. +`contingency table `_ for two or more +categorical variables. The algorithm takes categorical variables as input and returns a +table with the counts of the number of occurrences of each combination of categories. Authors ------- -.. List authors. +Bart van Beusekom, Frank Martin, Hasan Alradhi, *Netherlands Comprehensive Cancer Organisation (IKNL).* Source code ----------- -.. Describe where to find source code and docker files +Source code is available in the following +`GitHub repository `_. 
Contents diff --git a/docs/requirements.txt b/docs/requirements.txt index 99d09ba..a22c769 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,4 +3,5 @@ sphinx==5.3.0 sphinx-autobuild sphinx-autodoc-typehints sphinx-click==4.4.0 +sphinxcontrib-plantuml furo==2022.12.7 \ No newline at end of file diff --git a/docs/static/java/plantuml.jar b/docs/static/java/plantuml.jar new file mode 100644 index 0000000..aa8d50f Binary files /dev/null and b/docs/static/java/plantuml.jar differ diff --git a/docs/v6-crosstab-py/implementation.rst b/docs/v6-crosstab-py/implementation.rst index a19f7bf..7313d9c 100644 --- a/docs/v6-crosstab-py/implementation.rst +++ b/docs/v6-crosstab-py/implementation.rst @@ -4,20 +4,76 @@ Implementation Overview -------- -Central (``central_crosstab``) ------------------ -The central part is responsible for the orchestration and aggregation of the algorithm. +The implementation is rather straightforward. The central part requests the partial +contingency table from each node, which they compute in one go. The central part then +aggregates the partial contingency tables to the final table. + +.. uml:: + + !theme superhero-outline + + caption The central part of the algorithm is responsible for the \ + orchestration and aggregation\n of the algorithm. The partial \ + parts are executed on each node. + + |client| + :request analysis; + + |central| + :Collect organizations + in collaboration; + :Create partial tasks; + + |partial| + :Partial_crosstab creates + partial contingency tables; + + |partial| + :Mask values below + privacy threshold; + + |central| + :Combine contingency + tables; + + |client| + :Receive results; -.. Describe the central function here. Partials -------- + Partials are the computations that are executed on each node. The partials have access to the data that is stored on the node. The partials are executed in parallel on each node. 
``partial_crosstab`` -~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~ + +The partial function computes the local contingency table. Any values below the privacy threshold +are converted to a range. For example, if the privacy threshold is 5, then all values +below 5 are converted to '0-4'. The local contingency table is sent to the central part. + +The partial function includes several privacy checks - see the +:ref:`privacy guards <privacy-guards>` section for more information. + +Central +------- + +The central part is responsible for the aggregation of the cross-tables of individual +nodes. + +``central_crosstab`` +~~~~~~~~~~~~~~~~~~~~ + +The central part sums up the local contingency tables resulting in the global +contingency table. -.. Describe the partial function. +- If the local contingency table contains values below the privacy threshold, the + central part will show a range instead of the actual value. For example, if the + privacy threshold is 5, and the two nodes included in an analysis report back 0-4 + and 11, the central part will show 11-15 as the value for the corresponding cell. +- If one node does not contain a cell that another node does have, the central part will + simply count zero for the missing cell, i.e. the central part will simply show the + sum of the values that *are* reported. diff --git a/docs/v6-crosstab-py/privacy.rst b/docs/v6-crosstab-py/privacy.rst index cb3a2df..9165a61 100644 --- a/docs/v6-crosstab-py/privacy.rst +++ b/docs/v6-crosstab-py/privacy.rst @@ -1,17 +1,67 @@ Privacy ======= +.. _privacy-guards: + Guards ------ -.. What have you done to protect your users' privacy? E.g. threshold on low counts, -.. noise addition, etc. +There are several guards in place to prevent sharing too much information on individual +records: + +- **Thresholding**: The system will only share information if there are at least `n` + records in the group. This is to prevent sharing information on individual records. + By default, the threshold is set to 5 records. 
Node administrators can change this + threshold by adding the following to their node configuration file: + + .. code:: yaml + + algorithm_env: + CROSSTAB_PRIVACY_THRESHOLD: 5 + + and setting the value to the desired threshold. This configuration will ensure that + an environment variable `CROSSTAB_PRIVACY_THRESHOLD` is set to the desired threshold + and passed to the algorithm container. + + Note that the algorithm also requires at least one field of the contingency table to + pass the threshold. This is to prevent that if a task is created for a column that + contains only unique values, the result would reveal which unique values are present + in the column. + +- **Minimum number of data rows to participate**: A node will only participate if it + contains at least `n` data rows. This is to prevent nodes with very little data from + participating in the computation. By default, the minimum number of data rows is set + to 5. Node administrators can change this minimum by adding the following to their + node configuration file: + + .. code:: yaml + + algorithm_env: + CROSSTAB_MINIMUM_ROWS_TOTAL: 5 + +- **Not allowing zero values**: By default, the system will not allow values of zero to be shared. + In principle, it should be OK to share zero values, since this only confirms an + absence of certain combinations of values. However, it may be possible to infer + information from zero values. For example, for a rather sparse contingency table, + the information which combinations exist at a certain node is more valuable than for + a dense table. It is therefore possible not to share this information by not sharing + zero values. + + If you do wish to share zero values, you can add the following to your node + configuration: + + .. code:: yaml + + algorithm_env: + CROSSTAB_ALLOW_ZERO: true + Data sharing ------------ -.. which data is shared between the parties? E.g. for an average, sum and total count -.. are shared. 
+The only intermediate data that is shared are the local contingency tables. These +are formatted in the same way as the final result, but contain only the data from +the local node. The risk of sharing this data is low, as it concerns aggregated data. Vulnerabilities to known attacks -------------------------------- @@ -27,11 +77,12 @@ Vulnerabilities to known attacks - Risk eliminated? - Risk analysis * - Reconstruction - - ⚠ - - May happen if ... + - ✔ + - * - Differencing - ❌ - - Possible by doing A then B... + - May be possible by making a smart selection with preprocessing, or by sending + multiple tasks before and after data is updated. * - Deep Leakage from Gradients (DLG) - ✔ - diff --git a/docs/v6-crosstab-py/references.rst b/docs/v6-crosstab-py/references.rst index 9c45aab..8281e57 100644 --- a/docs/v6-crosstab-py/references.rst +++ b/docs/v6-crosstab-py/references.rst @@ -1,5 +1,10 @@ References ========== -.. If applicable, include references to papers, books, or other documents - for further reading. \ No newline at end of file +This particular algorithm has not been published yet. If you use this code in your +research, please cite the following paper: + +1. Moncada-Torres, Arturo, et al. "VANTAGE6: an open source priVAcy preserviNg federaTed + leArninG infrastructurE for Secure Insight eXchange." *AMIA annual symposium proceedings.* + Vol. 2020. American Medical Informatics Association, 2020. + `[link] `_ diff --git a/docs/v6-crosstab-py/usage.rst b/docs/v6-crosstab-py/usage.rst index b55b42e..3ec22a3 100644 --- a/docs/v6-crosstab-py/usage.rst +++ b/docs/v6-crosstab-py/usage.rst @@ -4,8 +4,22 @@ How to use Input arguments --------------- -.. describe the input arguments: -.. ['organizations_to_include'] +Input arguments +--------------- + +.. list-table:: + :widths: 20 80 + :header-rows: 1 + + * - Argument + - Description + * - ``results_col`` + - The column whose categories will be the columns of the contingency table. 
+ * - ``group_cols`` + - One or more columns whose categories, or combinations of categories, will be the + rows of the contingency table. + * - ``organizations_to_include`` + - Which organizations to include in the computation. Python client example --------------------- @@ -15,7 +29,47 @@ framework. If you are not, please read the `documentation `_. -.. TODO Some explanation of the code below +Let's say you want to know how many males and females are overweight in different age +groups, e.g. something like: + +.. list-table:: + :widths: 20 20 20 20 + :header-rows: 1 + + * - AgeGroup + - isOverweight + - Male + - Female + * - 0-18 + - True + - 11 + - 6 + * - 0-18 + - False + - 30 + - 29 + * - 18-65 + - True + - 55 + - 44 + * - 18-65 + - False + - 50 + - 56 + * - 65+ + - True + - 5 + - 10 + * - 65+ + - False + - 15 + - 14 + + +Such a result could be obtained by running the following Python client code. Note that +``AgeGroup``, ``isOverweight``, and ``Gender`` should be categorical values in your +dataset, and that you should replace the values at the top to authenticate with your +vantage6 server. .. 
code-block:: python @@ -34,23 +88,23 @@ first, especially the part about the client.authenticate(username, password) input_ = { - 'master': True, 'method': 'central_crosstab', - 'args': [], 'kwargs': { - 'organizations_to_include': 'my_value', - }, - 'output_format': 'json' + 'results_col': 'Gender', + 'group_cols': ["AgeGroup", "isOverweight"] + } } my_task = client.task.create( collaboration=1, organizations=[1], - name='v6-crosstab-py', + name='Compute contingency table', description='Create a contingency table showing the relationship between two or more variables', - image='harbor2.vantage6.ai/algorithms/v6-crosstab-py:v4', + image='harbor2.vantage6.ai/algorithms/v6-crosstab-py:latest', input=input_, - data_format='json' + databases=[ + {'label': 'default'} + ] ) task_id = my_task.get('id') diff --git a/docs/v6-crosstab-py/validation.rst b/docs/v6-crosstab-py/validation.rst index 40b8bb8..629e82b 100644 --- a/docs/v6-crosstab-py/validation.rst +++ b/docs/v6-crosstab-py/validation.rst @@ -1,5 +1,11 @@ Validation ========== -.. Describe how the algorithm has been tested and how a user may test the algorithm -.. themselves (if applicable). \ No newline at end of file +A `test script `_ is +available in the `test` directory. It can be run with the following command: + +.. code-block:: bash + + python test/test.py + +The script will run the crosstab algorithm via the vantage6 ``MockAlgorithmClient``.