Skip to content

Commit

Permalink
[RELEASE] 0.1.5
Browse files Browse the repository at this point in the history
  • Loading branch information
grosenberger committed Feb 14, 2020
1 parent c193704 commit 85f4a28
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 58 deletions.
42 changes: 4 additions & 38 deletions easypqp/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ def generate_ionseries(peptide_sequence, precursor_charge, fragment_charges=[1,2

return list(fragments.keys()), np.fromiter(fragments.values(), np.float, len(fragments))

def conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses):
def conversion(pepxmlfile, spectralfile, unimodfile, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses):
# Parse basename
base_name = basename_spectralfile(spectralfile)
click.echo("Info: Parsing run %s." % base_name)
Expand All @@ -469,46 +469,12 @@ def conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range,
um = unimod(unimodfile, max_delta_unimod)

# Parse pepXML
click.echo("Info: Parsing pepXML.")
px = pepxml(pepxmlfile, um, base_name, exclude_range)
psms = px.get()

# Continue if any PSMs are present
if psms.shape[0] > 0:
# Generate UniMod peptide sequence
click.echo("Info: Matching modifications to UniMod.")

# Append PyProphet columns
run_id = basename_spectralfile(spectralfile)
psms['group_id'] = psms['run_id'] + "_" + psms['scan_id'].astype(str)

if 'var_expect' in psms.columns:
psms = psms.rename(index=str, columns={'var_expect': 'expect'})
psms['var_expectscore'] = 0.0 - np.log(psms['expect'])

if 'var_nextscore' in psms.columns and 'var_hyperscore' in psms.columns:
psms = psms.rename(index=str, columns={'var_nextscore': 'nextscore'})
psms['var_deltascore'] = 1.0 - (psms['nextscore'] / psms['var_hyperscore'])

# DIA-Umpire quality tiers
if run_id.endswith("_Q1"):
psms['quality'] = 1
elif run_id.endswith("_Q2"):
psms['quality'] = 2
elif run_id.endswith("_Q3"):
psms['quality'] = 3
else: # DDA data
psms['quality'] = 0

if main_score not in psms.columns:
raise click.ClickException("Error: Main score '%s' is not present in pepXML." % main_score)

psms = psms.rename(index=str, columns={main_score: 'main_' + main_score})

# Check if pepXML is processed by TPP
if 'pep' in psms.columns:
tpp = True
else:
tpp = False

# Generate theoretical spectra
click.echo("Info: Generate theoretical spectra.")
Expand All @@ -529,9 +495,9 @@ def conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range,
# Round floating numbers
peaks = peaks.round(6)

return psms, peaks, tpp
return psms, peaks
else:
return pd.DataFrame({'run_id': [], 'scan_id': [], 'hit_rank': [], 'massdiff': [], 'precursor_charge': [], 'retention_time': [], 'ion_mobility': [], 'peptide_sequence': [], 'modifications': [], 'nterm_modification': [], 'cterm_modification': [], 'protein_id': [], 'gene_id': [], 'num_tot_proteins': [], 'decoy': []}), pd.DataFrame({'scan_id': [], 'modified_peptide': [], 'precursor_charge': [], 'precursor_mz': [], 'fragment': [], 'product_mz': [], 'intensity': []}), True
return pd.DataFrame({'run_id': [], 'scan_id': [], 'hit_rank': [], 'massdiff': [], 'precursor_charge': [], 'retention_time': [], 'ion_mobility': [], 'peptide_sequence': [], 'modifications': [], 'nterm_modification': [], 'cterm_modification': [], 'protein_id': [], 'gene_id': [], 'num_tot_proteins': [], 'decoy': []}), pd.DataFrame({'scan_id': [], 'modified_peptide': [], 'precursor_charge': [], 'precursor_mz': [], 'fragment': [], 'product_mz': [], 'intensity': []})

def basename_spectralfile(spectralfile):
'''
Expand Down
20 changes: 16 additions & 4 deletions easypqp/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,19 +190,19 @@ def lowess(run, reference_run, xcol, ycol, lowess_frac, psm_fdr_threshold, min_p

return run

def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencefile, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_frac, rt_psm_fdr_threshold, im_lowess_frac, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus):
def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, rt_filter, im_referencefile, im_filter, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_frac, rt_psm_fdr_threshold, im_lowess_frac, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus):
# Parse input arguments
psm_files = []
spectra = []

for file in files:
if 'psms' in file:
if 'psmpkl' in file:
psm_files.append(file)
if 'peakpkl' in file:
spectra.append(file)

if len(psm_files) == 0:
raise click.ClickException("No PSMs files present. Need to have tag 'psms' in filename.")
raise click.ClickException("No PSMs files present. Need to have tag 'psmpkl' in filename.")

if len(spectra) == 0:
raise click.ClickException("No spectrum files present. Need to have tag 'peakpkl' in filename.")
Expand All @@ -219,7 +219,7 @@ def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencef
psms_list = []
for psm_file in psm_files:
click.echo("Info: Reading file %s." % psm_file)
psm_tab = pd.read_csv(psm_file, index_col=False, sep='\t')
psm_tab = pd.read_pickle(psm_file)
if psm_tab.shape[0] > 0:
psms_list.append(psm_tab)
psms = pd.concat(psms_list).reset_index(drop=True)
Expand Down Expand Up @@ -253,6 +253,12 @@ def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencef
# Select reference run
pepidr_stats = pepidr.groupby('base_name')[['modified_peptide']].count().reset_index()
click.echo(pepidr_stats)

if im_filter is not None:
click.echo("Info: Filter candidate IM reference runs by tag '%s'." % im_filter)
pepidr_stats = pepidr_stats[pepidr_stats['base_name'].str.contains(im_filter)]
click.echo(pepidr_stats)

im_reference_run_base_name = pepidr_stats.loc[pepidr_stats['modified_peptide'].idxmax()]['base_name']

im_reference_run = pepidr[pepidr['base_name'] == im_reference_run_base_name].copy()
Expand All @@ -272,6 +278,12 @@ def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencef
# Select reference run
pepidr_stats = pepidr.groupby('base_name')[['modified_peptide']].count().reset_index()
click.echo(pepidr_stats)

if rt_filter is not None:
click.echo("Info: Filter candidate RT reference runs by tag '%s'." % rt_filter)
pepidr_stats = pepidr_stats[pepidr_stats['base_name'].str.contains(rt_filter)]
click.echo(pepidr_stats)

rt_reference_run_base_name = pepidr_stats.loc[pepidr_stats['modified_peptide'].idxmax()]['base_name']

rt_reference_run = pepidr[pepidr['base_name'] == rt_reference_run_base_name].copy()
Expand Down
23 changes: 8 additions & 15 deletions easypqp/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@ def cli():
@click.option('--spectra', 'spectralfile', required=True, type=click.Path(exists=True), help='The input mzXML or MGF (timsTOF only) file.')
@click.option('--unimod', 'unimodfile', required=False, type=click.Path(exists=True), help='The input UniMod XML file.')
@click.option('--psms', 'psmsfile', required=False, type=click.Path(exists=False), help='Output PSMs file.')
@click.option('--subpsms', 'subpsmsfile', required=False, type=click.Path(exists=False), help='Output subsampled PSMs file.')
@click.option('--peaks', 'peaksfile', required=False, type=click.Path(exists=False), help='Output peaks file.')
@click.option('--main_score', default="var_expectscore", show_default=True, type=str, help='Main score to use for PyProphet.')
@click.option('--exclude-range', 'exclude_range_str', default="-1.5,3.5", show_default=True, required=False, type=str, help='massdiff in this range will not be mapped to UniMod.')
@click.option('--max_delta_unimod', default=0.02, show_default=True, type=float, help='Maximum delta mass (Dalton) for UniMod annotation.')
@click.option('--max_delta_ppm', default=15, show_default=True, type=float, help='Maximum delta mass (PPM) for annotation.')
Expand All @@ -37,7 +35,7 @@ def cli():
@click.option('--enable_specific_losses/--no-enable_specific_losses', default=False, show_default=True, help='Enable specific fragment ion losses.')
@click.option('--enable_unspecific_losses/--no-enable_unspecific_losses', default=False, show_default=True, help='Enable unspecific fragment ion losses.')
@click.option('--subsample_fraction', default=1.0, show_default=True, type=float, help='Data fraction used for subsampling.')
def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfile, main_score, exclude_range_str, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses, subsample_fraction):
def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, peaksfile, exclude_range_str, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses, subsample_fraction):
"""
Convert pepXML files for EasyPQP
"""
Expand All @@ -47,26 +45,19 @@ def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfi

run_id = basename_spectralfile(spectralfile)
if psmsfile is None:
psmsfile = run_id + "_psms.tsv"
if subpsmsfile is None:
subpsmsfile = run_id + "_subpsms.tsv"
psmsfile = run_id + ".psmpkl"
if peaksfile is None:
peaksfile = run_id + ".peakpkl"

temp = exclude_range_str.split(',')
exclude_range = [float(temp[0]), float(temp[1])]

click.echo("Info: Converting %s." % pepxmlfile)
psms, peaks, tpp = conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses)
psms, peaks = conversion(pepxmlfile, spectralfile, unimodfile, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses)

psms.to_csv(psmsfile, sep="\t", index=False)
psms.to_pickle(psmsfile)
click.echo("Info: PSMs successfully converted and stored in %s." % psmsfile)

if not tpp:
subpsms = psms.sample(frac=subsample_fraction)
subpsms.to_csv(subpsmsfile, sep="\t", index=False)
click.echo("Info: Subsampled PSMs successfully converted and stored in %s." % subpsmsfile)

peaks.to_pickle(peaksfile)
click.echo("Info: Peaks successfully converted and stored in %s." % peaksfile)

Expand All @@ -77,7 +68,9 @@ def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfi
@click.option('--psmtsv', 'psmtsv', required=False, type=click.Path(exists=False), help='psm.tsv file from Philosopher.')
@click.option('--peptidetsv', 'peptidetsv', required=False, type=click.Path(exists=False), help='peptide.tsv file from Philosopher.')
@click.option('--rt_reference', 'rt_referencefile', required=False, type=click.Path(exists=True), help='Optional iRT/CiRT reference file.')
@click.option('--rt_filter', 'rt_filter', required=False, type=str, help='Optional tag to filter candidate RT reference runs.')
@click.option('--im_reference', 'im_referencefile', required=False, type=click.Path(exists=True), help='Optional IM reference file.')
@click.option('--im_filter', 'im_filter', required=False, type=str, help='Optional tag to filter candidate IM reference runs.')
@click.option('--psm_fdr_threshold', default=0.01, show_default=True, type=float, help='PSM FDR threshold.')
@click.option('--peptide_fdr_threshold', default=0.01, show_default=True, type=float, help='Peptide FDR threshold.')
@click.option('--protein_fdr_threshold', default=0.01, show_default=True, type=float, help='Protein FDR threshold.')
Expand All @@ -91,12 +84,12 @@ def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfi
@click.option('--min_peptides', default=5, show_default=True, type=int, help='Minimum peptides required for successful alignment.')
@click.option('--proteotypic/--no-proteotypic', show_default=True, default=True, help='Use only proteotypic, unique, non-shared peptides.')
@click.option('--consensus/--no-consensus', show_default=True, default=True, help='Generate consensus instead of best replicate spectra.')
def library(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencefile, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus):
def library(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, rt_filter, im_referencefile, im_filter, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus):
"""
Generate EasyPQP library
"""

generate(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencefile, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus)
generate(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, rt_filter, im_referencefile, im_filter, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus)
click.echo("Info: Library successfully generated.")

# EasyPQP Reduce
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

setup(
name='easypqp',
version='0.1.4',
version='0.1.5',
description='EasyPQP: Simple library generation for OpenSWATH',
long_description=long_description,
long_description_content_type='text/markdown',
Expand Down

0 comments on commit 85f4a28

Please sign in to comment.