Skip to content

Commit

Permalink
Merge pull request #734 from uclahs-cds/czhu-add-plot-summary
Browse files Browse the repository at this point in the history
Add a barplot of the summarizeFasta results
  • Loading branch information
zhuchcn authored May 17, 2023
2 parents 9462827 + 9250b54 commit 3d4f7e9
Show file tree
Hide file tree
Showing 8 changed files with 120 additions and 6 deletions.
1 change: 1 addition & 0 deletions .github/workflows/docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs:
echo 'jinja2==3.0.0' >> requirements.txt
echo 'mkdocstrings==0.16.2' >> requirements.txt
echo 'mkdocs-macros-plugin==0.6.0' >> requirements.txt
echo 'matplotlib==3.3.1' >> requirements.txt
echo '.' >> requirements.txt
- name: Deploy docs
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install biopython pathos pytest psutil six
pip install biopython pathos pytest psutil six matplotlib
- name: Run Unit Tests
run: |
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

- `callAltTranslation` added to call peptides with alternative translation without any genomic or transcriptomic variations.

- Enabled `summarizeFasta` to create a bar plot of the summary results.

### Fixed

- Fixed `fake` so that simulated selenocysteine positions can no longer be in introns.
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@ COPY . /opt/moPepGen

ARG PYTHON_VER=3.8.11
ARG BIOPYTHON_VER=1.79
ARG MATPLOTLIB_VER=3.3.1

RUN conda create -qy -p /usr/local\
python==${PYTHON_VER}

RUN cd /opt/moPepGen/ && \
pip install . biopython==${BIOPYTHON_VER}
pip install . biopython==${BIOPYTHON_VER} matplotlib==${MATPLOTLIB_VER}

# Deploy the target tools into a base image
FROM ubuntu:20.04
Expand Down
73 changes: 72 additions & 1 deletion moPepGen/aa/PeptidePoolSummarizer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
""" module for peptide pool summarizer """
from __future__ import annotations
import itertools
from typing import Dict, IO, List, Set, FrozenSet, Tuple
import statistics
from typing import Dict, IO, List, Set, FrozenSet, Tuple, Optional
import matplotlib.pyplot as plt
from moPepGen import gtf, seqvar
from moPepGen.aa.AminoAcidSeqRecord import AminoAcidSeqRecord
from moPepGen.aa.VariantPeptideLabel import VariantPeptideInfo, \
Expand Down Expand Up @@ -48,6 +50,13 @@
]
}

# BPF default color scheme.
# One color per miscleavage level (0 through 11), i.e. 12 entries.
# Hopefully no one is using more than 11 miscleavages
# NOTE: every entry must be followed by a comma; adjacent string literals are
# silently concatenated by Python into a single (invalid) color code.
COLOR_SCHEME = [
    "#FFA500", "#458B00", "#68228B", "#FFD700", "#1E90FF", "#CD2626",
    "#9ACD32", "#FF7F00", "#473C8B", "#43CD80", "#CD3278", "#00C5CD",
]

class NoncanonicalPeptideSummaryTable():
""" Summary table for noncaonincal peptides
Expand Down Expand Up @@ -231,3 +240,65 @@ def write_summary_table(self, handle:IO):
key = frozenset(comb)
record = self.summary_table.get_stringified_summary_entry(key)
handle.write(record + '\n')

def create_barplot(self, width:float=6, height:float=8, ax:Optional[plt.Axes]=None,
        scale:Optional[str]=None) -> plt.Axes:
    """ Make a horizontal barplot of the summarized data.

    One bar is drawn per source combination that is present in the summary
    table, listed top to bottom in the order the combinations are generated.

    Args:
        width (float): Figure width, passed to `plt.subplots(figsize=...)`.
            Only used when `ax` is None.
        height (float): Figure height, passed to `plt.subplots(figsize=...)`.
            Only used when `ax` is None.
        ax (plt.Axes): Axes to draw on. A new figure is created when None.
        scale (str): 'normal' draws bars stacked by number of miscleavages;
            'log' draws the total per combination on a log-scaled x axis.
            Any other value (including None) selects the scale
            automatically: 'log' if the mean of the totals is more than
            2.5 times their median, otherwise 'normal'.

    Returns:
        The axes the bars were drawn on.
    """
    # misc -> counts, one per plotted combination (aligned with `labels`)
    data:Dict[int,List[int]] = {}
    labels:List[str] = []
    sources = [it[0] for it in sorted(self.order.items(), key=lambda x:x[1])]
    n_misc = self.summary_table.max_misc + 1
    for size in range(1, len(sources) + 1):
        for comb in itertools.combinations(sources, size):
            # Ignore it if the source isn't present in any GVF given.
            if self.ignore_missing_source and \
                    any(not self.summary_table.has_source(k) for k in comb):
                continue
            if self.contains_exclusive_sources(comb):
                continue
            key = frozenset(comb)
            if key not in self.summary_table.data:
                continue

            # Join the ordered tuple rather than the frozenset: set iteration
            # order is not stable between runs, which would shuffle labels.
            labels.append('-'.join(comb))
            for x in range(n_misc):
                data.setdefault(x, []).append(
                    self.summary_table.get_n_x_misc(key, x)
                )

    # Total number of peptides per combination, across all miscleavages.
    totals = [sum(counts) for counts in zip(*data.values())]

    # Unless specified, log scale is used if the summary distribution is skewed.
    if scale not in ['normal', 'log']:
        if totals:
            # Written as a product so a median of 0 cannot divide by zero.
            skewed = statistics.mean(totals) > 2.5 * statistics.median(totals)
            scale = 'log' if skewed else 'normal'
        else:
            # Nothing to plot; mean/median are undefined on empty data.
            scale = 'normal'

    if ax is None:
        _, ax = plt.subplots(figsize=(width, height))

    # barh draws from the bottom up, so reverse to list the first
    # combination at the top.
    y_labels = list(reversed(labels))
    if scale == 'log':
        ax.barh(y=y_labels, width=list(reversed(totals)))
        ax.set_xscale('log')
    else:
        # Running left edge of the next stacked segment for each bar.
        offset = [0] * len(y_labels)
        for i, (n, counts) in enumerate(data.items()):
            vals = list(reversed(counts))
            # `width` is this segment's own count; `left` carries the stack.
            ax.barh(
                y=y_labels, width=vals, left=offset, label=n,
                color=COLOR_SCHEME[i % len(COLOR_SCHEME)]
            )
            offset = [a + b for a, b in zip(offset, vals)]
        ax.legend(title='Miscleavages')
    ax.set_xlabel('Number of Peptides')
    return ax
43 changes: 40 additions & 3 deletions moPepGen/cli/summarize_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,15 @@
from pathlib import Path
import sys
from typing import IO
import matplotlib.pyplot as plt
from moPepGen.cli import common
from moPepGen.aa.PeptidePoolSummarizer import PeptidePoolSummarizer


# Accepted file extensions for each input/output; every entry includes the
# leading dot so that all lists follow the same convention.
GVF_FILE_FORMAT = ['.gvf']
FASTA_FILE_FORMAT = ['.fasta', '.fa']
OUTPUT_FILE_FORMATS = ['.txt', '.tsv']
OUTPUT_IMAGE_FORMATS = ['.pdf', '.jpg', '.jpeg', '.png']


# pylint: disable=W0212
Expand Down Expand Up @@ -70,11 +72,30 @@ def add_subparser_summarize_fasta(subparser:argparse._SubParsersAction):
metavar='<file>',
default=None
)
p.add_argument(
'--output-image',
type=Path,
help=f"File path to the output barplot. Valid formats: {OUTPUT_IMAGE_FORMATS}",
metavar='<file>',
default=None
)
p.add_argument(
'--ignore-missing-source',
action='store_true',
help='Ignore the sources missing from input GVF.'
)
group_plot_scale = p.add_mutually_exclusive_group()
group_plot_scale.add_argument(
'--plot-normal-scale',
action='store_true',
help='Draw the summary bar plot in normal scale.'
)
group_plot_scale.add_argument(
'--plot-log-scale',
action='store_true',
help='Draw the summary bar plot in log scale.'
)

common.add_args_cleavage(p, enzyme_only=True)
common.add_args_reference(p, genome=False, proteome=True)
common.add_args_quiet(p)
Expand Down Expand Up @@ -102,9 +123,14 @@ def summarize_fasta(args:argparse.Namespace) -> None:
args.variant_peptides, FASTA_FILE_FORMAT, check_readable=True
)

common.validate_file_format(
args.output_path, OUTPUT_FILE_FORMATS, check_writable=True
)
if args.output_path is not None:
common.validate_file_format(
args.output_path, OUTPUT_FILE_FORMATS, check_writable=True
)
if args.output_image is not None:
common.validate_file_format(
args.output_image, OUTPUT_IMAGE_FORMATS, check_writable=True
)

common.print_start_message(args)

Expand Down Expand Up @@ -142,3 +168,14 @@ def summarize_fasta(args:argparse.Namespace) -> None:

with output_context(args.output_path) as handle:
summarizer.write_summary_table(handle)

if args.output_image:
if args.plot_log_scale:
scale = 'log'
elif args.plot_normal_scale:
scale = 'normal'
else:
scale = None
fig, ax = plt.subplots(figsize=(8, 8))
summarizer.create_barplot(ax=ax, scale=scale)
fig.savefig(args.output_image, bbox_inches="tight")
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ install_requires =
psutil
pathos
six
matplotlib

[options.packages.find]
where = .
Expand Down
1 change: 1 addition & 0 deletions test/integration/test_summarize_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def create_base_args(self) -> argparse.Namespace:
args.order_source = None
args.cleavage_rule = 'trypsin'
args.output_path = self.work_dir/'output.txt'
args.output_image = None
args.ignore_missing_source = False
args.reference_source = None
return args
Expand Down

0 comments on commit 3d4f7e9

Please sign in to comment.