Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a barplot of the summarizeFasta results #734

Merged
merged 5 commits into from
May 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
echo 'jinja2==3.0.0' >> requirements.txt
echo 'mkdocstrings==0.16.2' >> requirements.txt
echo 'mkdocs-macros-plugin==0.6.0' >> requirements.txt
echo 'matplotlib==3.3.1' >> requirements.txt
echo '.' >> requirements.txt

- name: Deploy docs
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install biopython pathos pytest psutil six
pip install biopython pathos pytest psutil six matplotlib

- name: Run Unit Tests
run: |
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

- `callAltTranslation` added to call peptides with alternative translation without any genomic or transcriptomic variations.

- Enabled `summarizeFasta` to create bar plot of the summary results.

### Fixed

- Fixed `fake` that simulated selenocysteine positions could be in introns.
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ COPY . /opt/moPepGen

ARG PYTHON_VER=3.8.11
ARG BIOPYTHON_VER=1.79
ARG MATPLOTLIB_VER=3.3.1

RUN conda create -qy -p /usr/local\
python==${PYTHON_VER}

RUN cd /opt/moPepGen/ && \
pip install . biopython==${BIOPYTHON_VER}
pip install . biopython==${BIOPYTHON_VER} matplotlib==${MATPLOTLIB_VER}

# Deploy the target tools into a base image
FROM ubuntu:20.04
Expand Down
73 changes: 72 additions & 1 deletion moPepGen/aa/PeptidePoolSummarizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
""" module for peptide pool summarizer """
from __future__ import annotations
import itertools
from typing import Dict, IO, List, Set, FrozenSet, Tuple
import statistics
from typing import Dict, IO, List, Set, FrozenSet, Tuple, Optional
import matplotlib.pyplot as plt
from moPepGen import gtf, seqvar
from moPepGen.aa.AminoAcidSeqRecord import AminoAcidSeqRecord
from moPepGen.aa.VariantPeptideLabel import VariantPeptideInfo, \
Expand Down Expand Up @@ -48,6 +50,13 @@
]
}

# BPF default color scheme.
# Hopefully no one is using more than 11 miscleavages
zhuchcn marked this conversation as resolved.
Show resolved Hide resolved
COLOR_SCHEME = [
"#FFA500", "#458B00", "#68228B", "#FFD700", "#1E90FF", "#CD2626", "#9ACD32"
"#FF7F00", "#473C8B", "#43CD80", "#CD3278", "#00C5CD"
]

class NoncanonicalPeptideSummaryTable():
""" Summary table for noncaonincal peptides

Expand Down Expand Up @@ -231,3 +240,65 @@ def write_summary_table(self, handle:IO):
key = frozenset(comb)
record = self.summary_table.get_stringified_summary_entry(key)
handle.write(record + '\n')

def create_barplot(self, width:float=6, height:float=8, ax:Optional[plt.Axes]=None,
scale:str=None) -> plt.Axes:
""" Make a barplot of the summarized data. """
# misc -> source -> count
data:Dict[int,List[int]] = {}
keys:List[str] = []
sources = [it[0] for it in sorted(self.order.items(), key=lambda x:x[1])]
for i in range(len(sources)):
for comb in itertools.combinations(sources, i + 1):
if self.ignore_missing_source:
# Ignore it if the source isn't present in any GVF given.
if any(not self.summary_table.has_source(k) for k in comb):
continue
if self.contains_exclusive_sources(comb):
continue
key = frozenset(comb)
if key not in self.summary_table.data:
continue

keys.append('-'.join(key))
for x in range(self.summary_table.max_misc + 1):
n = self.summary_table.get_n_x_misc(key, x)
if x not in data:
data[x] = []
data[x].append(n)

totals = []
for x in data.values():
if not totals:
totals = x
else:
totals = [i+j for i,j in zip(totals, x)]
mean = statistics.mean(totals)
median = statistics.median(totals)

# Unless specified, log scale is used if the summary distribution is skewed.
if scale not in ['normal', 'log']:
scale = 'normal' if mean / median <= 2.5 else 'log'

if ax is None:
_, ax = plt.subplots(figsize=(width, height))

if scale == 'log':
ax.barh(
y=list(reversed(keys)), width=list(reversed(totals))
)
ax.set_xscale('log')
else:
offset = None
for i,n in enumerate(data.keys()):
vals = list(reversed(data[n]))
if offset:
vals = [x + y for x,y in zip(vals, offset)]
ax.barh(
y=list(reversed(keys)), width=vals, left=offset, label=n,
color=COLOR_SCHEME[i]
)
offset = vals
ax.legend(title='Miscleavages')
ax.set_xlabel('Number of Peptides')
return ax
43 changes: 40 additions & 3 deletions moPepGen/cli/summarize_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
from pathlib import Path
import sys
from typing import IO
import matplotlib.pyplot as plt
from moPepGen.cli import common
from moPepGen.aa.PeptidePoolSummarizer import PeptidePoolSummarizer


GVF_FILE_FORMAT = ['.gvf']
FASTA_FILE_FORMAT = ['.fasta', '.fa']
OUTPUT_FILE_FORMATS = ['.txt', 'tsv']
OUTPUT_IMAGE_FORMATS = ['.pdf', '.jpg', '.jpeg', '.png']


# pylint: disable=W0212
Expand Down Expand Up @@ -70,11 +72,30 @@ def add_subparser_summarize_fasta(subparser:argparse._SubParsersAction):
metavar='<file>',
default=None
)
p.add_argument(
'--output-image',
type=Path,
help=f"File path to the output barplot. Valid formats: {OUTPUT_IMAGE_FORMATS}",
metavar='<file>',
default=None
)
p.add_argument(
'--ignore-missing-source',
action='store_true',
help='Ignore the sources missing from input GVF.'
)
group_plot_scale = p.add_mutually_exclusive_group()
group_plot_scale.add_argument(
'--plot-normal-scale',
action='store_true',
help='Draw the summary bar plot in normal scale.'
)
group_plot_scale.add_argument(
'--plot-log-scale',
action='store_true',
help='Draw the summary bar plot in log scale.'
)

common.add_args_cleavage(p, enzyme_only=True)
common.add_args_reference(p, genome=False, proteome=True)
common.add_args_quiet(p)
Expand Down Expand Up @@ -102,9 +123,14 @@ def summarize_fasta(args:argparse.Namespace) -> None:
args.variant_peptides, FASTA_FILE_FORMAT, check_readable=True
)

common.validate_file_format(
args.output_path, OUTPUT_FILE_FORMATS, check_writable=True
)
if args.output_path is not None:
common.validate_file_format(
args.output_path, OUTPUT_FILE_FORMATS, check_writable=True
)
if args.output_image is not None:
common.validate_file_format(
args.output_image, OUTPUT_IMAGE_FORMATS, check_writable=True
)

common.print_start_message(args)

Expand Down Expand Up @@ -142,3 +168,14 @@ def summarize_fasta(args:argparse.Namespace) -> None:

with output_context(args.output_path) as handle:
summarizer.write_summary_table(handle)

if args.output_image:
if args.plot_log_scale:
scale = 'log'
elif args.plot_normal_scale:
scale = 'normal'
else:
scale = None
fig, ax = plt.subplots(figsize=(8, 8))
summarizer.create_barplot(ax=ax, scale=scale)
fig.savefig(args.output_image, bbox_inches="tight")
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ install_requires =
psutil
pathos
six
matplotlib

[options.packages.find]
where = .
Expand Down
1 change: 1 addition & 0 deletions test/integration/test_summarize_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def create_base_args(self) -> argparse.Namespace:
args.order_source = None
args.cleavage_rule = 'trypsin'
args.output_path = self.work_dir/'output.txt'
args.output_image = None
args.ignore_missing_source = False
args.reference_source = None
return args
Expand Down