From 02d04d303f0d316fdd0470af11f891f2c6d18eb8 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Thu, 4 May 2023 16:37:03 +0800 Subject: [PATCH 1/4] feat (summarizeFasta): create a bar plot of the summary results --- CHANGELOG.md | 2 ++ moPepGen/aa/PeptidePoolSummarizer.py | 51 +++++++++++++++++++++++++++- moPepGen/cli/summarize_fasta.py | 25 ++++++++++++-- 3 files changed, 74 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c3c812d4..29327e11 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - `callAltTranslation` added to call peptides with alternative translation without any genomic or transcriptomic variations. +- Enabled `summarizeFasta` to create bar plot of the summary results. + ### Fixed - Fixed `fake` that simulated selenocysteine positions could be in introns. diff --git a/moPepGen/aa/PeptidePoolSummarizer.py b/moPepGen/aa/PeptidePoolSummarizer.py index 9d52b4a9..a290021c 100644 --- a/moPepGen/aa/PeptidePoolSummarizer.py +++ b/moPepGen/aa/PeptidePoolSummarizer.py @@ -1,7 +1,8 @@ """ module for peptide pool summarizer """ from __future__ import annotations import itertools -from typing import Dict, IO, List, Set, FrozenSet, Tuple +from typing import Dict, IO, List, Set, FrozenSet, Tuple, Optional +import matplotlib.pyplot as plt from moPepGen import gtf, seqvar from moPepGen.aa.AminoAcidSeqRecord import AminoAcidSeqRecord from moPepGen.aa.VariantPeptideLabel import VariantPeptideInfo, \ @@ -48,6 +49,13 @@ ] } +# BPF default color scheme. +# Hopefully no one is using more than 11 miscleavages +COLOR_SCHEME = [ + "#FFA500", "#458B00", "#68228B", "#FFD700", "#1E90FF", "#CD2626", "#9ACD32" + "#FF7F00", "#473C8B", "#43CD80", "#CD3278", "#00C5CD" +] + class NoncanonicalPeptideSummaryTable(): """ Summary table for noncaonincal peptides @@ -231,3 +239,44 @@ def write_summary_table(self, handle:IO): key = frozenset(comb) record = self.summary_table.get_stringified_summary_entry(key) handle.write(record + '\n') + + def create_barplot(self, width:float=6, height:float=8, ax:Optional[plt.Axes]=None + ) -> plt.Axes: + """ Make a barplot of the summarized data. """ + data:Dict[int,List[int]] = {} + keys:List[str] = [] + sources = [it[0] for it in sorted(self.order.items(), key=lambda x:x[1])] + for i in range(len(sources)): + for comb in itertools.combinations(sources, i + 1): + if self.ignore_missing_source: + # Ignore it if the source isn't present in any GVF given. + if any(not self.summary_table.has_source(k) for k in comb): + continue + if self.contains_exclusive_sources(comb): + continue + key = frozenset(comb) + if key not in self.summary_table.data: + continue + + keys.append('-'.join(key)) + for x in range(self.summary_table.max_misc + 1): + n = self.summary_table.get_n_x_misc(key, x) + if x not in data: + data[x] = [] + data[x].append(n) + + if ax is None: + _, ax = plt.subplots(figsize=(width, height)) + offset = None + for i,n in enumerate(data.keys()): + vals = list(reversed(data[n])) + if offset: + vals = [x + y for x,y in zip(vals, offset)] + ax.barh( + y=list(reversed(keys)), width=vals, left=offset, label=n, + color=COLOR_SCHEME[i] + ) + offset = vals + ax.set_xlabel('Number of Peptides') + ax.legend(title='Miscleavages') + return ax diff --git a/moPepGen/cli/summarize_fasta.py b/moPepGen/cli/summarize_fasta.py index 27d24dff..83fda2b4 100644 --- a/moPepGen/cli/summarize_fasta.py +++ b/moPepGen/cli/summarize_fasta.py @@ -6,6 +6,7 @@ import argparse from contextlib import contextmanager from pathlib import Path +import matplotlib.pyplot as plt import sys from typing import IO from moPepGen.cli import common @@ -15,6 +16,7 @@ GVF_FILE_FORMAT = ['.gvf'] FASTA_FILE_FORMAT = ['.fasta', '.fa'] OUTPUT_FILE_FORMATS = ['.txt', 'tsv'] +OUTPUT_IMAGE_FORMATS = ['.pdf', '.jpg', '.jpeg', '.png'] # pylint: disable=W0212 @@ -70,6 +72,13 @@ def add_subparser_summarize_fasta(subparser:argparse._SubParsersAction): metavar='', default=None ) + p.add_argument( + '--output-image', + type=Path, + help=f"File path to the output barplot. Valid formats: {OUTPUT_IMAGE_FORMATS}", + metavar='', + default=None + ) p.add_argument( '--ignore-missing-source', action='store_true', @@ -102,9 +111,14 @@ def summarize_fasta(args:argparse.Namespace) -> None: args.variant_peptides, FASTA_FILE_FORMAT, check_readable=True ) - common.validate_file_format( - args.output_path, OUTPUT_FILE_FORMATS, check_writable=True - ) + if args.output_path is not None: + common.validate_file_format( + args.output_path, OUTPUT_FILE_FORMATS, check_writable=True + ) + if args.output_image is not None: + common.validate_file_format( + args.output_image, OUTPUT_IMAGE_FORMATS, check_writable=True + ) common.print_start_message(args) @@ -142,3 +156,8 @@ def summarize_fasta(args:argparse.Namespace) -> None: with output_context(args.output_path) as handle: summarizer.write_summary_table(handle) + + if args.output_image: + fig, ax = plt.subplots(figsize=(8, 8)) + summarizer.create_barplot(ax=ax) + fig.savefig(args.output_image, bbox_inches="tight") From e866607f3229f864458f85416ca0af2722792cc8 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Thu, 4 May 2023 16:40:05 +0800 Subject: [PATCH 2/4] fix (moPepGen): add matplotlib as dependency --- .github/workflows/docs.yaml | 1 + .github/workflows/tests.yaml | 2 +- Dockerfile | 3 ++- setup.cfg | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 12407468..3cb33054 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -21,6 +21,7 @@ jobs: echo 'jinja2==3.0.0' >> requirements.txt echo 'mkdocstrings==0.16.2' >> requirements.txt echo 'mkdocs-macros-plugin==0.6.0' >> requirements.txt + echo 'matplotlib==3.3.1' >> requirements.txt echo '.' >> requirements.txt - name: Deploy docs diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index b2dbbd3a..045168af 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -26,7 +26,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install biopython pathos pytest psutil six + pip install biopython pathos pytest psutil six matplotlib - name: Run Unit Tests run: | diff --git a/Dockerfile b/Dockerfile index 877fb7a5..8150601f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,12 +4,13 @@ COPY . /opt/moPepGen ARG PYTHON_VER=3.8.11 ARG BIOPYTHON_VER=1.79 +ARG MATPLOTLIB_VER=3.3.1 RUN conda create -qy -p /usr/local\ python==${PYTHON_VER} RUN cd /opt/moPepGen/ && \ - pip install . biopython==${BIOPYTHON_VER} + pip install . biopython==${BIOPYTHON_VER} matplotlib==${MATPLOTLIB_VER} # Deploy the target tools into a base image FROM ubuntu:20.04 diff --git a/setup.cfg b/setup.cfg index 01bfe5ad..08656dae 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,6 +27,7 @@ install_requires = psutil pathos six + matplotlib [options.packages.find] where = . From c8fc50492e058805cf3ad7dc549aeff44efd1074 Mon Sep 17 00:00:00 2001 From: zhuchcn Date: Thu, 4 May 2023 17:36:58 +0800 Subject: [PATCH 3/4] fix (moPepGen): fixed import order and test cases because of the new arguments --- moPepGen/cli/summarize_fasta.py | 2 +- test/integration/test_summarize_fasta.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/moPepGen/cli/summarize_fasta.py b/moPepGen/cli/summarize_fasta.py index 83fda2b4..c51645e7 100644 --- a/moPepGen/cli/summarize_fasta.py +++ b/moPepGen/cli/summarize_fasta.py @@ -6,9 +6,9 @@ import argparse from contextlib import contextmanager from pathlib import Path -import matplotlib.pyplot as plt import sys from typing import IO +import matplotlib.pyplot as plt from moPepGen.cli import common from moPepGen.aa.PeptidePoolSummarizer import PeptidePoolSummarizer diff --git a/test/integration/test_summarize_fasta.py b/test/integration/test_summarize_fasta.py index 689f9687..2357ee80 100644 --- a/test/integration/test_summarize_fasta.py +++ b/test/integration/test_summarize_fasta.py @@ -17,6 +17,7 @@ def create_base_args(self) -> argparse.Namespace: args.order_source = None args.cleavage_rule = 'trypsin' args.output_path = self.work_dir/'output.txt' + args.output_image = None args.ignore_missing_source = False args.reference_source = None return args From 0df7ae7cd34e86d15aae6f877c06cdf681938695 Mon Sep 17 00:00:00 2001 From: Chenghao Zhu Date: Tue, 16 May 2023 15:25:08 +0800 Subject: [PATCH 4/4] fix (summarizeFasta): plot in log scale if the data is skewed --- moPepGen/aa/PeptidePoolSummarizer.py | 44 +++++++++++++++++++++------- moPepGen/cli/summarize_fasta.py | 20 ++++++++++++- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/moPepGen/aa/PeptidePoolSummarizer.py b/moPepGen/aa/PeptidePoolSummarizer.py index a290021c..08f3c087 100644 --- a/moPepGen/aa/PeptidePoolSummarizer.py +++ b/moPepGen/aa/PeptidePoolSummarizer.py @@ -1,6 +1,7 @@ """ module for peptide pool summarizer """ from __future__ import annotations import itertools +import statistics from typing import Dict, IO, List, Set, FrozenSet, Tuple, Optional import matplotlib.pyplot as plt from moPepGen import gtf, seqvar @@ -240,9 +241,10 @@ def write_summary_table(self, handle:IO): record = self.summary_table.get_stringified_summary_entry(key) handle.write(record + '\n') - def create_barplot(self, width:float=6, height:float=8, ax:Optional[plt.Axes]=None - ) -> plt.Axes: + def create_barplot(self, width:float=6, height:float=8, ax:Optional[plt.Axes]=None, + scale:str=None) -> plt.Axes: """ Make a barplot of the summarized data. """ + # misc -> source -> count data:Dict[int,List[int]] = {} keys:List[str] = [] sources = [it[0] for it in sorted(self.order.items(), key=lambda x:x[1])] @@ -265,18 +267,38 @@ def create_barplot(self, width:float=6, height:float=8, ax:Optional[plt.Axes]=No data[x] = [] data[x].append(n) + totals = [] + for x in data.values(): + if not totals: + totals = x + else: + totals = [i+j for i,j in zip(totals, x)] + mean = statistics.mean(totals) + median = statistics.median(totals) + + # Unless specified, log scale is used if the summary distribution is skewed. + if scale not in ['normal', 'log']: + scale = 'normal' if mean / median <= 2.5 else 'log' + if ax is None: _, ax = plt.subplots(figsize=(width, height)) - offset = None - for i,n in enumerate(data.keys()): - vals = list(reversed(data[n])) - if offset: - vals = [x + y for x,y in zip(vals, offset)] + + if scale == 'log': ax.barh( - y=list(reversed(keys)), width=vals, left=offset, label=n, - color=COLOR_SCHEME[i] + y=list(reversed(keys)), width=list(reversed(totals)) ) - offset = vals + ax.set_xscale('log') + else: + offset = None + for i,n in enumerate(data.keys()): + vals = list(reversed(data[n])) + if offset: + vals = [x + y for x,y in zip(vals, offset)] + ax.barh( + y=list(reversed(keys)), width=vals, left=offset, label=n, + color=COLOR_SCHEME[i] + ) + offset = vals + ax.legend(title='Miscleavages') ax.set_xlabel('Number of Peptides') - ax.legend(title='Miscleavages') return ax diff --git a/moPepGen/cli/summarize_fasta.py b/moPepGen/cli/summarize_fasta.py index c51645e7..61cf6d0f 100644 --- a/moPepGen/cli/summarize_fasta.py +++ b/moPepGen/cli/summarize_fasta.py @@ -84,6 +84,18 @@ def add_subparser_summarize_fasta(subparser:argparse._SubParsersAction): action='store_true', help='Ignore the sources missing from input GVF.' ) + group_plot_scale = p.add_mutually_exclusive_group() + group_plot_scale.add_argument( + '--plot-normal-scale', + action='store_true', + help='Draw the summary bar plot in normal scale.' + ) + group_plot_scale.add_argument( + '--plot-log-scale', + action='store_true', + help='Draw the summary bar plot in log scale.' + ) + common.add_args_cleavage(p, enzyme_only=True) common.add_args_reference(p, genome=False, proteome=True) common.add_args_quiet(p) @@ -158,6 +170,12 @@ def summarize_fasta(args:argparse.Namespace) -> None: summarizer.write_summary_table(handle) if args.output_image: + if args.plot_log_scale: + scale = 'log' + elif args.plot_normal_scale: + scale = 'normal' + else: + scale = None fig, ax = plt.subplots(figsize=(8, 8)) - summarizer.create_barplot(ax=ax) + summarizer.create_barplot(ax=ax, scale=scale) fig.savefig(args.output_image, bbox_inches="tight")