# run_permutation_test.py

"""
Description:
This script iterates through a directory of network files
generated from the `run_network_creation.py` script and applies a
permutation test to the networks.

If there is more than 1 network file in the directory,
the networks are combined to make a single aggregate network.

Edges that are significant under their corresponding nulls
(generated by the permutation test) are kept in the resulting
final (aggregate, if applicable) network.

Currently, defaults are set up for the eADAGE analysis provided in
the PathCORE-T paper.

Output:
    - <significant-edges-file> [default: significant_edges.tsv]
      significant edges with the following columns:
        pw0, pw1, p-value, q-value (FDR-corrected p-value), odds ratio
      where pw0 (pathway 0) and pw1 (pathway 1) are the two pathways
      that share an edge.

    - <filtered-network-file> [default: filtered_network.tsv]
      Network file that only contains the edges
      specified in `significant_edges.tsv`.
      Edge weight is updated to be an odds ratio:
         observed / average expected weight
      If this is an aggregate network, the feature column
      should be empty.
Both file names can be changed through command-line arguments
(see documentation)

Usage:
    run_permutation_test.py
        <significant-pws-dir> <output-dir>
        [--n-permutations=<n-permutations>]
        [--n-features=<n-features>] [--n-cores=<n-cores>]
        [--alpha=<alpha>]
        [--edges-out=<edges-file>]
        [--network-out=<network-file>]
        [--shorten=<pathway-shorten>]
        [--random-seed=<random-seed>]
    run_permutation_test.py -h | --help

Options:
    -h --help                         Show this screen

    <significant-pws-dir>             Path to the directory containing the
                                      significant pathways files generated by
                                      running `build_co_occurrence_network.py`

    <output-dir>                      Path to the directory that will store
                                      the output files. Will be automatically
                                      created if the directory does not
                                      currently exist

    --n-permutations=<n-permutations> Number of permutations generated for a
                                      network
                                      [default: 10000]

    --n-features=<n-features>         Number of constructed features in each
                                      model
                                      [default: 300]

    --n-cores=<n-cores>               Number of cores used to run the analysis
                                      on models in parallel
                                      [default: num. available cores - 1]

    --alpha=<alpha>                   Significance level for pathway
                                      enrichment. Overrepresentation is
                                      determined by a Fisher's exact test
                                      with false-discovery rate correction
                                      Benjamini-Hochberg (1995, 2000)
                                      [default: 0.05]

    --edges-out=<edges-file>          Filename to store the edges that are
                                      significant after the permutation test
                                      [default: significant_edges.tsv]

    --network-out=<network-file>      Filename to store the (aggregate)
                                      network containing only the significant
                                      edges
                                      [default: filtered_network.tsv]

    --random-seed=<random-seed>       Set the random state for the
                                      permutation test
                                      [default: 640]
"""
from copy import deepcopy
import multiprocessing
import os
import random
from time import time

from docopt import docopt
from joblib import Parallel, delayed
from pathcore import CoNetwork
from pathcore import aggregate_permuted_network, \
        network_edges_permutation_test


# Fallback values mirroring the `[default: ...]` entries documented in the
# module docstring. docopt normally fills these in itself.
# NOTE(review): some docopt releases appear to stop reading the "Options:"
# section at the first blank line, in which case an omitted option arrives
# as None rather than its documented default -- confirm against the
# installed docopt version.
FALLBACK_DEFAULTS = {
    "--n-permutations": "10000",
    "--n-features": "300",
    "--alpha": "0.05",
    "--edges-out": "significant_edges.tsv",
    "--network-out": "filtered_network.tsv",
    "--random-seed": "640",
}


def option_value(arguments, name):
    """Return the string value of option `name` from the parsed arguments.

    Parameters
    ----------
    arguments : dict
        Mapping of option names to values, as returned by `docopt`.
    name : str
        Option name including the leading dashes, e.g. "--alpha".

    Returns
    -------
    str
        The value found in `arguments`, or the documented default from
        `FALLBACK_DEFAULTS` when the option is missing or None.
    """
    value = arguments.get(name)
    if value is None:
        return FALLBACK_DEFAULTS[name]
    return value


def resolve_n_cores(requested):
    """Translate the `--n-cores` option into a usable worker count.

    Parameters
    ----------
    requested : str or None
        Raw option value. A string of digits is used as the worker count;
        anything else (None, or the non-numeric documented default text)
        selects "number of available cores - 1".

    Returns
    -------
    int
        A worker count that is always at least 1. The lower bound matters
        because `cpu_count() - 1` is 0 on a single-core machine and joblib
        rejects `n_jobs=0`.
    """
    if requested is not None and requested.isdigit():
        n_cores = int(requested)
    else:
        n_cores = multiprocessing.cpu_count() - 1
    return max(1, n_cores)


def main():
    """Run the permutation test described in the module docstring.

    Reads every file in <significant-pws-dir> into a `CoNetwork`, aggregates
    them into a single observed network, generates `--n-permutations`
    permuted aggregate networks in parallel, and hands everything to
    `network_edges_permutation_test`, which writes the two output files
    into <output-dir>.

    Raises
    ------
    SystemExit
        If <significant-pws-dir> contains no files to build networks from.
    """
    arguments = docopt(
        __doc__, version="run permutation test 1.0")
    significant_pathways_directory = arguments["<significant-pws-dir>"]
    output_directory = arguments["<output-dir>"]

    # NOTE(review): `--shorten` is accepted by the usage pattern but is not
    # read anywhere in this script.
    n_permutations = int(option_value(arguments, "--n-permutations"))
    n_features = int(option_value(arguments, "--n-features"))
    alpha = float(option_value(arguments, "--alpha"))

    edges_filename = option_value(arguments, "--edges-out")
    final_network_filename = option_value(arguments, "--network-out")

    random_seed = int(option_value(arguments, "--random-seed"))
    # NOTE(review): this seeds only the `random` module of this process.
    # Whether the permutations (run in joblib workers) observe this seed
    # depends on the joblib backend and on which RNG pathcore uses --
    # confirm before relying on it for reproducibility.
    random.seed(random_seed)

    os.makedirs(output_directory, exist_ok=True)

    n_cores = resolve_n_cores(arguments.get("--n-cores"))

    t_o = time()
    aggregate_observed_network = None
    individual_networks = []
    # os.listdir returns entries in arbitrary order; sort so that the
    # aggregation order is the same from run to run.
    for filename in sorted(os.listdir(significant_pathways_directory)):
        path_to_significant_pathways_file = os.path.join(
            significant_pathways_directory, filename)
        if os.path.isdir(path_to_significant_pathways_file):
            continue
        network_object = CoNetwork(
            n_features,
            significant_pathways=path_to_significant_pathways_file)
        individual_networks.append(network_object)
        # Compare against None explicitly so the check does not depend on
        # whether a CoNetwork instance happens to be falsy.
        if aggregate_observed_network is None:
            aggregate_observed_network = deepcopy(network_object)
        else:
            aggregate_observed_network.aggregate(network_object)

    # Fail early with a clear message instead of running every permutation
    # and then passing None as the observed network.
    if aggregate_observed_network is None:
        raise SystemExit(
            "No significant pathways files found in '{0}'.".format(
                significant_pathways_directory))

    with Parallel(n_jobs=n_cores) as parallel:
        permutations = parallel(
            delayed(aggregate_permuted_network)(individual_networks)
            for _ in range(n_permutations))
    # The elapsed time covers both loading the networks and permuting them.
    t_f = time() - t_o

    n_networks = len(individual_networks)

    print("{0} permutations of {1} network(s) took "
          "{2} seconds to run on {3} core(s).".format(
              n_permutations, n_networks, t_f, n_cores))

    # this function returns a CoNetwork object. because the network is
    # also written to the `--network-out` file, we ignore the
    # return value in this context.
    network_edges_permutation_test(
        aggregate_observed_network, permutations, alpha,
        n_networks=n_networks,
        output_edges_to_file=os.path.join(
            output_directory, edges_filename),
        output_network_to_file=os.path.join(
            output_directory, final_network_filename))


if __name__ == "__main__":
    main()