diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 12407468..3cb33054 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -21,6 +21,7 @@ jobs: echo 'jinja2==3.0.0' >> requirements.txt echo 'mkdocstrings==0.16.2' >> requirements.txt echo 'mkdocs-macros-plugin==0.6.0' >> requirements.txt + echo 'matplotlib==3.3.1' >> requirements.txt echo '.' >> requirements.txt - name: Deploy docs diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index b2dbbd3a..045168af 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -26,7 +26,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install biopython pathos pytest psutil six + pip install biopython pathos pytest psutil six matplotlib - name: Run Unit Tests run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index 70ca8c9d..39f55090 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - `callAltTranslation` added to call peptides with alternative translation without any genomic or transcriptomic variations. +- Enabled `summarizeFasta` to create bar plot of the summary results. + ### Fixed - Fixed `fake` that simulated selenocysteine positions could be in introns. diff --git a/Dockerfile b/Dockerfile index 877fb7a5..8150601f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,12 +4,13 @@ COPY . /opt/moPepGen ARG PYTHON_VER=3.8.11 ARG BIOPYTHON_VER=1.79 +ARG MATPLOTLIB_VER=3.3.1 RUN conda create -qy -p /usr/local\ python==${PYTHON_VER} RUN cd /opt/moPepGen/ && \ - pip install . biopython==${BIOPYTHON_VER} + pip install . biopython==${BIOPYTHON_VER} matplotlib==${MATPLOTLIB_VER} # Deploy the target tools into a base image FROM ubuntu:20.04 diff --git a/moPepGen/aa/PeptidePoolSummarizer.py b/moPepGen/aa/PeptidePoolSummarizer.py index 9d52b4a9..08f3c087 100644 --- a/moPepGen/aa/PeptidePoolSummarizer.py +++ b/moPepGen/aa/PeptidePoolSummarizer.py @@ -1,7 +1,9 @@ """ module for peptide pool summarizer """ from __future__ import annotations import itertools -from typing import Dict, IO, List, Set, FrozenSet, Tuple +import statistics +from typing import Dict, IO, List, Set, FrozenSet, Tuple, Optional +import matplotlib.pyplot as plt from moPepGen import gtf, seqvar from moPepGen.aa.AminoAcidSeqRecord import AminoAcidSeqRecord from moPepGen.aa.VariantPeptideLabel import VariantPeptideInfo, \ @@ -48,6 +50,13 @@ ] } +# BPF default color scheme. +# Hopefully no one is using more than 11 miscleavages +COLOR_SCHEME = [ + "#FFA500", "#458B00", "#68228B", "#FFD700", "#1E90FF", "#CD2626", "#9ACD32" + "#FF7F00", "#473C8B", "#43CD80", "#CD3278", "#00C5CD" +] + class NoncanonicalPeptideSummaryTable(): """ Summary table for noncaonincal peptides @@ -231,3 +240,65 @@ def write_summary_table(self, handle:IO): key = frozenset(comb) record = self.summary_table.get_stringified_summary_entry(key) handle.write(record + '\n') + + def create_barplot(self, width:float=6, height:float=8, ax:Optional[plt.Axes]=None, + scale:str=None) -> plt.Axes: + """ Make a barplot of the summarized data. """ + # misc -> source -> count + data:Dict[int,List[int]] = {} + keys:List[str] = [] + sources = [it[0] for it in sorted(self.order.items(), key=lambda x:x[1])] + for i in range(len(sources)): + for comb in itertools.combinations(sources, i + 1): + if self.ignore_missing_source: + # Ignore it if the source isn't present in any GVF given. + if any(not self.summary_table.has_source(k) for k in comb): + continue + if self.contains_exclusive_sources(comb): + continue + key = frozenset(comb) + if key not in self.summary_table.data: + continue + + keys.append('-'.join(key)) + for x in range(self.summary_table.max_misc + 1): + n = self.summary_table.get_n_x_misc(key, x) + if x not in data: + data[x] = [] + data[x].append(n) + + totals = [] + for x in data.values(): + if not totals: + totals = x + else: + totals = [i+j for i,j in zip(totals, x)] + mean = statistics.mean(totals) + median = statistics.median(totals) + + # Unless specified, log scale is used if the summary distribution is skewed. + if scale not in ['normal', 'log']: + scale = 'normal' if mean / median <= 2.5 else 'log' + + if ax is None: + _, ax = plt.subplots(figsize=(width, height)) + + if scale == 'log': + ax.barh( + y=list(reversed(keys)), width=list(reversed(totals)) + ) + ax.set_xscale('log') + else: + offset = None + for i,n in enumerate(data.keys()): + vals = list(reversed(data[n])) + if offset: + vals = [x + y for x,y in zip(vals, offset)] + ax.barh( + y=list(reversed(keys)), width=vals, left=offset, label=n, + color=COLOR_SCHEME[i] + ) + offset = vals + ax.legend(title='Miscleavages') + ax.set_xlabel('Number of Peptides') + return ax diff --git a/moPepGen/cli/summarize_fasta.py b/moPepGen/cli/summarize_fasta.py index 27d24dff..61cf6d0f 100644 --- a/moPepGen/cli/summarize_fasta.py +++ b/moPepGen/cli/summarize_fasta.py @@ -8,6 +8,7 @@ from pathlib import Path import sys from typing import IO +import matplotlib.pyplot as plt from moPepGen.cli import common from moPepGen.aa.PeptidePoolSummarizer import PeptidePoolSummarizer @@ -15,6 +16,7 @@ GVF_FILE_FORMAT = ['.gvf'] FASTA_FILE_FORMAT = ['.fasta', '.fa'] OUTPUT_FILE_FORMATS = ['.txt', 'tsv'] +OUTPUT_IMAGE_FORMATS = ['.pdf', '.jpg', '.jpeg', '.png'] # pylint: disable=W0212 @@ -70,11 +72,30 @@ def add_subparser_summarize_fasta(subparser:argparse._SubParsersAction): metavar='', default=None ) + p.add_argument( + '--output-image', + type=Path, + help=f"File path to the output barplot. Valid formats: {OUTPUT_IMAGE_FORMATS}", + metavar='', + default=None + ) p.add_argument( '--ignore-missing-source', action='store_true', help='Ignore the sources missing from input GVF.' ) + group_plot_scale = p.add_mutually_exclusive_group() + group_plot_scale.add_argument( + '--plot-normal-scale', + action='store_true', + help='Draw the summary bar plot in normal scale.' + ) + group_plot_scale.add_argument( + '--plot-log-scale', + action='store_true', + help='Draw the summary bar plot in log scale.' + ) + common.add_args_cleavage(p, enzyme_only=True) common.add_args_reference(p, genome=False, proteome=True) common.add_args_quiet(p) @@ -102,9 +123,14 @@ def summarize_fasta(args:argparse.Namespace) -> None: args.variant_peptides, FASTA_FILE_FORMAT, check_readable=True ) - common.validate_file_format( - args.output_path, OUTPUT_FILE_FORMATS, check_writable=True - ) + if args.output_path is not None: + common.validate_file_format( + args.output_path, OUTPUT_FILE_FORMATS, check_writable=True + ) + if args.output_image is not None: + common.validate_file_format( + args.output_image, OUTPUT_IMAGE_FORMATS, check_writable=True + ) common.print_start_message(args) @@ -142,3 +168,14 @@ def summarize_fasta(args:argparse.Namespace) -> None: with output_context(args.output_path) as handle: summarizer.write_summary_table(handle) + + if args.output_image: + if args.plot_log_scale: + scale = 'log' + elif args.plot_normal_scale: + scale = 'normal' + else: + scale = None + fig, ax = plt.subplots(figsize=(8, 8)) + summarizer.create_barplot(ax=ax, scale=scale) + fig.savefig(args.output_image, bbox_inches="tight") diff --git a/setup.cfg b/setup.cfg index 01bfe5ad..08656dae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,6 +27,7 @@ install_requires = psutil pathos six + matplotlib [options.packages.find] where = . diff --git a/test/integration/test_summarize_fasta.py b/test/integration/test_summarize_fasta.py index 689f9687..2357ee80 100644 --- a/test/integration/test_summarize_fasta.py +++ b/test/integration/test_summarize_fasta.py @@ -17,6 +17,7 @@ def create_base_args(self) -> argparse.Namespace: args.order_source = None args.cleavage_rule = 'trypsin' args.output_path = self.work_dir/'output.txt' + args.output_image = None args.ignore_missing_source = False args.reference_source = None return args