diff --git a/bin/gtftk b/bin/gtftk index c9e0b683..5e6b57e0 100644 --- a/bin/gtftk +++ b/bin/gtftk @@ -1,5 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- + + +# Avoid AttributeError: module '__main__' has no attribute '__spec__'. +# A weird bug which seems to be associated to the calling prgm (e.g. spider) +__spec__ = "" + import hashlib import os import shutil @@ -24,7 +30,6 @@ from pygtftk.utils import flatten_list from pygtftk.utils import message from pygtftk.utils import silentremove from pygtftk.version import __version__ -from pygtftk.bwig.bw_coverage import TMP_FILE_POOL_MANAGER # Avoid warning message emitted by numpy # https://tinyurl.com/ybev6zrw @@ -93,6 +98,7 @@ def main(): if __name__ == "__main__": + from pygtftk.bwig.bw_coverage import TMP_FILE_POOL_MANAGER from signal import signal, SIGPIPE, SIG_DFL signal(SIGPIPE, SIG_DFL) diff --git a/changelog.md b/changelog.md index d9dc925a..167be48a 100644 --- a/changelog.md +++ b/changelog.md @@ -1,11 +1,10 @@ # Changelog -## v1.1.5 - +## v1.2.1 ### Bug Fixes -* Fix #128 (merge_attr error when using same key as source and destination). +* Fix multiprocessing issue with py3.8. ### API/CLI Changes @@ -13,14 +12,13 @@ ### Code changes -* None. +* Updated requirements.txt according to python 3.8. ### New Features * None. - -## v1.1.5 +## v1.2.0 This version introduces OLOGRAM-MODL, a new paradigm for OLOGRAM to find intersections between multiple sets of genomic regions at once and then compute their enrichment with OLOGRAM. An optional algorithm (MODL) to find interesting combinations with sparse dictionary learning and greedy submodular optimisation has also been added. Furthermore, it also contains major speedups to OLOGRAM itself. @@ -34,6 +32,7 @@ This version introduces OLOGRAM-MODL, a new paradigm for OLOGRAM to find interse * fix #124 * fix BED to BED convertion in arg_formatted.FormattedFile(). BED6+ files were considered as BED6- files. 
* fix #136 although --show-group-number is no more supported with gtftk profile when plotnine > 0.6.0 is used. +* Fix #128 (merge_attr error when using same key as source and destination). ### API Changes @@ -58,8 +57,6 @@ This version introduces OLOGRAM-MODL, a new paradigm for OLOGRAM to find interse * Introduced a *treeify_ologram_modl* plugin to visualize n-wise enrichment results as a treee * Introduced a *ologram_merge_runs* command to merge several runs to save RAM, treating each as a superbatch. - - ## v1.1.4 ### Bug Fixes @@ -99,7 +96,6 @@ This version introduces OLOGRAM-MODL, a new paradigm for OLOGRAM to find interse ## v1.1.2 - ### Bug Fixes * None @@ -116,7 +112,6 @@ This version introduces OLOGRAM-MODL, a new paradigm for OLOGRAM to find interse * The --more-bed-labels is now facultative in OLOGRAM. - ## v1.1.1 ### Bug Fixes @@ -140,7 +135,6 @@ This version introduces OLOGRAM-MODL, a new paradigm for OLOGRAM to find interse ## v1.1.0 - ### Bug Fixes * None. @@ -158,10 +152,8 @@ This version introduces OLOGRAM-MODL, a new paradigm for OLOGRAM to find interse * Support for Python 3.7. * The tss_numbering command now allows to add the number of different TSSs to the gene feature. - ## v1.0.9 - ### Bug Fixes * None. @@ -187,7 +179,6 @@ Enrichment of Annotations Tool) documentation. We can not warrant that the proce for more details. * Added -y/--display-fit-quality to ologram - ## v1.0.8 This version introduces *ologram_merge_stats* command that can be used to produce a heatmap from multiple OLOGRAM results. @@ -228,7 +219,6 @@ This version contains some minor code refactoring. See 1.0.6 for recent major ch * None. 
- ## v1.0.6 ### Bug Fixes diff --git a/docs/_images/example_01.png b/docs/_images/example_01.png index 08a728f5..9d6fe4a4 100644 Binary files a/docs/_images/example_01.png and b/docs/_images/example_01.png differ diff --git a/docs/_images/example_05.png b/docs/_images/example_05.png index 3f6c33d6..2937ac10 100644 Binary files a/docs/_images/example_05.png and b/docs/_images/example_05.png differ diff --git a/docs/_images/example_06.png b/docs/_images/example_06.png index 48f30732..79f3e737 100644 Binary files a/docs/_images/example_06.png and b/docs/_images/example_06.png differ diff --git a/docs/_images/example_06b.png b/docs/_images/example_06b.png index eb7ed09e..3cbeb7c0 100644 Binary files a/docs/_images/example_06b.png and b/docs/_images/example_06b.png differ diff --git a/docs/_images/example_08.png b/docs/_images/example_08.png index a2a823dd..861d1066 100644 Binary files a/docs/_images/example_08.png and b/docs/_images/example_08.png differ diff --git a/docs/_images/example_13.png b/docs/_images/example_13.png index 67fc2186..1fde3047 100644 Binary files a/docs/_images/example_13.png and b/docs/_images/example_13.png differ diff --git a/docs/_sources/ologram.rst.txt b/docs/_sources/ologram.rst.txt index 67e17707..631dfbde 100644 --- a/docs/_sources/ologram.rst.txt +++ b/docs/_sources/ologram.rst.txt @@ -54,7 +54,7 @@ The program will return statistics for both the number of intersections and the - H1: The regions of the query (--peak-file) tend to overlap the reference (--inputfile or --more-bed). -.. warning:: The ologram examples below use 8 CPUs. Please adapt. +.. warning:: The ologram examples below use 8 CPUs. Please adapt the number of threads. @@ -167,33 +167,16 @@ The program will return statistics for both the number of intersections and the ologram (multiple overlaps) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -While previously we computed paiwise enrichment (ie. Query+A, Query+B ...) 
, It is also possible to use the **OLOGRAM-MODL** Multiple Overlap Dictionary Learning) plugin to find multiple overlaps (ie. between n>=2 sets) enrichment (ie. Query+A+B, Query+A+C, ...) in order to highlight combinations of genomic regions, such as Transcriptional Regulator complexes. +While previously we computed pairwise enrichment (ie. Query+A, Query+B, ...), it is also possible to use the **OLOGRAM-MODL** (Multiple Overlap Dictionary Learning) plugin to find multiple overlaps (ie. between n>=2 sets) enrichment (ie. Query+A+B, Query+A+C, ...) in order to highlight combinations of genomic regions, such as Transcriptional Regulator complexes. This is done only on custom regions supplied as BEDs supplied with the `--more-bed` argument. In most cases you may use the --no-gtf argument and only pass the regions of interest. - -For statistical reasons, we recommend shuffling across a relevant subsection of the genome only (ie. enhancers only) using --bed-excl or --bed-incl to ensure the longer combinations have a reasonable chance of being randomly encountered in the shuffles. - - -**MODL itemset mining algorithm:** By default, OLOGRAM-MODL will compute the enrichment of all n-wise combinations that are encountered in the real data it was passed. This however can add up to 2**N combinations and make the result hard to read. Furthermore, in biological data noise is a real problem and can obscure the relevant combinations. - -As such, we also give the option to use a custom itemset mining algorithm on the true overlaps to identify interesting combinations. - -In broad strokes, this custom algorithm MODL (Multiple Overlap Dictionary Learning) will perform many matrix factorizations on the matrix of true overlaps to identify relevant correlation groups of genomic regions. Then a greedy algorithm based on how much these words improve the reconstruction will select the utmost best words. 
MODL is only used to filter the output of OLOGRAM : once it returns a list of interesting combination, OLOGRAM will compute their enrichment as usual, but for them only. Each combination is of the form [Query + A + B + C] where A, B and C are BED files given as --more-bed. You can also manually specify the combinations to be studied with the format defined in OLOGRAM notes (below). - -Unlike classical association rules mining algorithms, this focuses on mining relevant bio complexes/clusters and correlation groups (item sets), and you should not request more than 20-30 combinations. As a matrix factorization based algorithm, it is designed to be resistant -to noise which is a known problem in biological data. Its goal is to extract meaningful frequent combinations from noisy data. As a result however, it is biased in favor of the most abundant combinations in the data, and may return correlation groups if you ask for too few words (ie. if AB, BC and AC are complexes, ABC might be returned). - - -This itemset mining algorithm is a work-in-progress. Whether you use MODL will not change the results for each combination, it only changes which combinations are displayed. If you want the enrichment of all combinations, ignore it. To use MODL, use the --multiple-overlap-max-number-of-combinations argument. - +For statistical reasons, we recommend shuffling across a relevant subsection of the genome only (ie. enhancers only) using --bed-excl or --bed-incl to ensure the longer combinations have a reasonable chance of being randomly encountered in the shuffles. Conversely, if you do not filter the combinations, keep in mind that the longer ones may be enriched even though they are present only on a few base pairs, because at random they would be even rarer. **Exact combinations:** By default, OLOGRAM will compute "inexact" combinations, meaning that when encountering an overlap of [Query + A + B + C] it will count towards [A + B + ...]. For exact intersections (ie. 
[Query + A + B + nothing else]), set the --multiple-overlap-target-combi-size flag to the number of --more-bed plus one. You will know if the combinations are computed as inexact by the '...' in their name in the result file. Intersections not including the query file are discarded. - - **Simple example:** Comparing the query (-p) against two other BED files, analyzing multiple overlaps. @@ -238,19 +221,30 @@ Comparing the query (-p) against two other BED files, analyzing multiple overlap As the computation of multiple overlaps can be RAM-intensive, if you have a very large amount of candidate genomic feature sets (hundreds) we recommend selecting less candidates among them first by running a pairwise analysis. -**MODL algorithm API:** MODL can also be used independantly as a combination mining algorithm. -This can work on any type of data, biological or not, that respects the conventional formatting for lists of transactions: the data needs to be a matrix with one line per transaction and one column per element. +**MODL itemset mining algorithm:** By default, OLOGRAM-MODL will compute the enrichment of all n-wise combinations that are encountered in the real data it was passed. This however can add up to 2**N combinations and make the result hard to read. Furthermore, in biological data noise is a real problem and can obscure the relevant combinations. As such, we also give the option to use a custom itemset mining algorithm on the true overlaps to identify interesting combinations. + + + +Details +----------------- -For example, if you have three possible elements A, B and C, a line of [1,0,1] means a transaction containing A and C. -For a factor allowance of k and n final queried words, the matrix will be rebuilt with k*n words in step 1. -factor allowance is K in K*n words in step 1 where n is final queries nb of words. 
+In broad strokes, the custom itemset algorithm MODL (Multiple Overlap Dictionary Learning) will perform many matrix factorizations on the matrix of true overlaps to identify relevant correlation groups of genomic regions. Then a greedy algorithm based on how much these words improve the reconstruction will select the utmost best words. MODL is only used to filter the output of OLOGRAM: once it returns a list of interesting combinations, OLOGRAM will compute their enrichment as usual, but for them only. Each combination is of the form [Query + A + B + C] where A, B and C are BED files given as --more-bed. You can also manually specify the combinations to be studied with the format defined in OLOGRAM notes (below). -MODL and will discard combinations rarer than 1/10000 occurences to reduce computing times and will also reduce the abundance of all unique lines in the matrix to their square roots to reduce the emphasis on the most frequent elements. -However, this can magnify the impact of the noise quadratically as well, and can be disabled when using the manual API. +Unlike classical association rules mining algorithms, this focuses on mining relevant bio complexes/clusters and correlation groups (item sets), and you should not request more than 20-30 combinations. As a matrix factorization based algorithm, it is designed to be resistant +to noise which is a known problem in biological data. Its goal is to extract meaningful frequent combinations from noisy data. As a result however, it is biased in favor of the most abundant combinations in the data, and may return correlation groups if you ask for too few words (ie. if AB, BC and AC are complexes, ABC might be returned). + + +This itemset mining algorithm is a work-in-progress. Whether you use MODL will not change the results for each combination, it only changes which combinations are displayed. If you want the enrichment of all combinations, ignore it. 
To use MODL, use the --multiple-overlap-max-number-of-combinations argument. + + + +**MODL algorithm API:** MODL can also be used independently as a combination mining algorithm. +This can work on any type of data, biological or not, that respects the conventional formatting for lists of transactions: the data needs to be a matrix with one line per transaction and one column per element. For example, if you have three possible elements A, B and C, a line of [1,0,1] means a transaction containing A and C. +For a factor allowance of k and n final queried words, the matrix will be rebuilt with k*n words in step 1. MODL will discard combinations rarer than 1/10000 occurrences to reduce computing times. It will also reduce the abundance of all unique lines in the matrix to their square roots to reduce the emphasis on the most frequent elements. However, the latter can magnify the impact of the noise as well and can be disabled when using the manual API. To de-emphasize longer words, which can help in this case, we can also normalize words by their summed square in step 2. If you are passing a custom error function, it must have the signature error_function(X_true, X_rebuilt, code). X_true is the real data, X_rebuilt is the reconstruction to evaluate, and code is the encoded version which in our case is used to assess sparsity. All are NumPY matrices. @@ -272,11 +266,12 @@ Here is an example: nb_threads = 1, step_1_factor_allowance = 2, # How many words to ask for in each step 1 rebuilding, as a multiplier of multiple_overlap_max_number_of_combinations error_function = None, # Custom error function in step 2 - smother = True) # Should the smothering (quadratic reduction of abundance) be applied ? + smother = True, # Should the smothering (quadratic reduction of abundance) be applied ? + normalize_words = False) # Normalize words by their summed square in step 2 ? 
interesting_combis = combi_miner.find_interesting_combinations() -For more details about usage and implementation, please read the notes below : +For more details about usage and implementation, please read the notes below. **Arguments:** @@ -284,7 +279,27 @@ For more details about usage and implementation, please read the notes below : :shell: -Since the results of MODL only depend on the true intersections and not on the shuffles, you can run MODL with 1 shuffle to pre-select interesting combinations, and then run the full analysis on many shuffles. We then recommend selecting the combinations that interest you in the resulting tsv, using MODL's selection as a starting point, and adding or removing some combinations based on your own needs (eg. adding all the highest fold changes, or all particular combinations containing the Transcription Factor X that you are studying). Then, run ologram_modl_treeify on the resulting filtered tsv. + +**Manual intersection computing:** To manually compute an overlap matrix between any number of BED files, the following Python code can be used. + +.. code-block:: python + + import pybedtools + import numpy as np + from pygtftk.stats.intersect.overlap_stats_compute import compute_true_intersection + + # Register the BED files as pybedtools.BedTool objects + bedA = pybedtools.BedTool(path_to_your_query) + bedsB = [pybedtools.BedTool(bedfilepath) for bedfilepath in list_of_all_paths_to_more_bed] + + # Use our custom intersection computing algorithm to get the matrix of overlaps + true_intersection = compute_true_intersection(bedA, bedsB) + flags_matrix = np.array([i[3] for i in true_intersection]) + +The resulting flags_matrix is a NumPy array that can be edited, and on which MODL can be run. 
+ +Since the results of MODL only depend on the true intersections and not on the shuffles, you can run MODL with 1 shuffle or on a manually computed matrix as above to pre-select interesting combinations, and then run the full analysis on many shuffles. We then recommend selecting the combinations that interest you in the resulting tsv file, using MODL's selection as a starting point and adding or removing some combinations based on your own needs (eg. adding all the highest fold changes, or all particular combinations containing the Transcription Factor X that you are studying). + ologram_merge_stats @@ -325,11 +340,13 @@ This also works with OLOGRAM-MODL results, since they follow the same basic form ologram_modl_treeify ~~~~~~~~~~~~~~~~~~~~~~ -**Description:** Visualize n-wise enrichment results (OLOGRAM-MODL) as a tree of combinations. Works on the result (tsv file) of an OLOGRAM analysis called with --more-bed-multiple-overlap. +**Description:** Visualize n-wise enrichment results (OLOGRAM-MODL) as a tree of combinations. Works on the result (tsv file) of an OLOGRAM analysis called with --more-bed-multiple-overlap. On the graph, S designates the total number of basepairs in which this combination is encountered in the real data. Fold change gives the ratio with the number of basepairs in the shuffles, with the associated Negative Binomial p-value. + +This recommended representation is useful to find master regulators, by showing which additions to a combination increase its enrichment, and allowing you to see whether overlaps that contain the element X also contain the element Y (looking at how a child combination accounts for the S of its parent in an inexact counting). -We recommend this representation. The tsv file can be edited before passing it to the command, for example by keeping only the combinations you are interested in. 
+The tsv result file can be edited before passing it to the command, for example by keeping only the combinations you are interested in, such as all combinations containing the Transcription Factor you are studying. We recommend running MODL to make a pre-selection. -On the graph, S designated the total number of basepairs in which this combinations is encountered in the real data. Fold change gives the ratio with the number of basepairs in the shuffles, with the associated Negative Binomial p-value. +We also recommend discarding the rarest combinations found on such a very small number of basepairs that they are unlikely to be biologically significant. This is mostly relevant when you have many sets (k >= 5) since longer combinations will often be enriched through sheer unlikelihood. .. command-output:: gtftk ologram_modl_treeify -i multiple_overlap_trivial_ologram_stats.tsv -o treeified.pdf -l ThisWasTheNameOfTheQuery :shell: diff --git a/docs/_static/documentation_options.js b/docs/_static/documentation_options.js index c1bb9ae4..bd0de63c 100644 --- a/docs/_static/documentation_options.js +++ b/docs/_static/documentation_options.js @@ -1,6 +1,6 @@ var DOCUMENTATION_OPTIONS = { URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), - VERSION: '1.2.1', + VERSION: '1.2.3', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/docs/_static/example_01.png b/docs/_static/example_01.png index 9d6fe4a4..0104648b 100644 Binary files a/docs/_static/example_01.png and b/docs/_static/example_01.png differ diff --git a/docs/_static/example_01b.png b/docs/_static/example_01b.png index cf809c40..1c5660bb 100644 Binary files a/docs/_static/example_01b.png and b/docs/_static/example_01b.png differ diff --git a/docs/_static/example_02.png b/docs/_static/example_02.png index 8439c50a..f15653b3 100644 Binary files a/docs/_static/example_02.png and b/docs/_static/example_02.png differ diff --git a/docs/_static/example_05.png 
b/docs/_static/example_05.png index 2937ac10..4702bf91 100644 Binary files a/docs/_static/example_05.png and b/docs/_static/example_05.png differ diff --git a/docs/_static/example_06.png b/docs/_static/example_06.png index 79f3e737..713d6c11 100644 Binary files a/docs/_static/example_06.png and b/docs/_static/example_06.png differ diff --git a/docs/_static/example_06b.png b/docs/_static/example_06b.png index 3cbeb7c0..ecfe0e0b 100644 Binary files a/docs/_static/example_06b.png and b/docs/_static/example_06b.png differ diff --git a/docs/_static/example_07.png b/docs/_static/example_07.png index aaefb731..6c312dd1 100644 Binary files a/docs/_static/example_07.png and b/docs/_static/example_07.png differ diff --git a/docs/_static/example_08.png b/docs/_static/example_08.png index 861d1066..952b4eaf 100644 Binary files a/docs/_static/example_08.png and b/docs/_static/example_08.png differ diff --git a/docs/_static/example_13.png b/docs/_static/example_13.png index 1fde3047..c5c7d0c2 100644 Binary files a/docs/_static/example_13.png and b/docs/_static/example_13.png differ diff --git a/docs/_static/example_pa_01.pdf b/docs/_static/example_pa_01.pdf index 54ea2927..bd4f566c 100644 Binary files a/docs/_static/example_pa_01.pdf and b/docs/_static/example_pa_01.pdf differ diff --git a/docs/_static/example_pa_02.pdf b/docs/_static/example_pa_02.pdf index ec941371..82426456 100644 Binary files a/docs/_static/example_pa_02.pdf and b/docs/_static/example_pa_02.pdf differ diff --git a/docs/_static/example_pa_03.pdf b/docs/_static/example_pa_03.pdf index 379b27d2..d0325285 100644 Binary files a/docs/_static/example_pa_03.pdf and b/docs/_static/example_pa_03.pdf differ diff --git a/docs/_static/example_pa_04.pdf b/docs/_static/example_pa_04.pdf index 8b24f3fa..9f37f661 100644 Binary files a/docs/_static/example_pa_04.pdf and b/docs/_static/example_pa_04.pdf differ diff --git a/docs/_static/merge_ologram_stats_01.pdf b/docs/_static/merge_ologram_stats_01.pdf index 
72a77908..5d5e6e6e 100644 Binary files a/docs/_static/merge_ologram_stats_01.pdf and b/docs/_static/merge_ologram_stats_01.pdf differ diff --git a/docs/_static/treeified.pdf b/docs/_static/treeified.pdf index 465cbd87..dea2bfd4 100644 Binary files a/docs/_static/treeified.pdf and b/docs/_static/treeified.pdf differ diff --git a/docs/about.html b/docs/about.html index 1bd7dd9e..40bca68c 100644 --- a/docs/about.html +++ b/docs/about.html @@ -16,7 +16,7 @@ var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); })(); -