[RELEASE] 0.1.5

grosenberger · Feb 14, 2020 · 85f4a28 · 85f4a28
1 parent c193704
commit 85f4a28
Show file tree

Hide file tree

Showing 4 changed files with 29 additions and 58 deletions.
diff --git a/easypqp/convert.py b/easypqp/convert.py
@@ -460,7 +460,7 @@ def generate_ionseries(peptide_sequence, precursor_charge, fragment_charges=[1,2
 
 	return list(fragments.keys()), np.fromiter(fragments.values(), np.float, len(fragments))
 
-def conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses):
+def conversion(pepxmlfile, spectralfile, unimodfile, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses):
 	# Parse basename
 	base_name = basename_spectralfile(spectralfile)
 	click.echo("Info: Parsing run %s." % base_name)
@@ -469,46 +469,12 @@ def conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range,
 	um = unimod(unimodfile, max_delta_unimod)
 
 	# Parse pepXML
+	click.echo("Info: Parsing pepXML.")
 	px = pepxml(pepxmlfile, um, base_name, exclude_range)
 	psms = px.get()
 
 	# Continue if any PSMS are present
 	if psms.shape[0] > 0:
-		# Generate UniMod peptide sequence
-		click.echo("Info: Matching modifications to UniMod.")
-
-		# Append PyProphet columns
-		run_id = basename_spectralfile(spectralfile)
-		psms['group_id'] = psms['run_id'] + "_" + psms['scan_id'].astype(str)
-
-		if 'var_expect' in psms.columns:
-			psms = psms.rename(index=str, columns={'var_expect': 'expect'})
-			psms['var_expectscore'] = 0.0 - np.log(psms['expect'])
-
-		if 'var_nextscore' in psms.columns and 'var_hyperscore' in psms.columns:
-			psms = psms.rename(index=str, columns={'var_nextscore': 'nextscore'})
-			psms['var_deltascore'] = 1.0 - (psms['nextscore'] / psms['var_hyperscore'])
-
-		# DIA-Umpire quality tiers
-		if run_id.endswith("_Q1"):
-			psms['quality'] = 1
-		elif run_id.endswith("_Q2"):
-			psms['quality'] = 2
-		elif run_id.endswith("_Q3"):
-			psms['quality'] = 3
-		else: # DDA data
-			psms['quality'] = 0
-
-		if main_score not in psms.columns:
-			raise click.ClickException("Error: Main score '%s' is not present in pepXML." % main_score)
-
-		psms = psms.rename(index=str, columns={main_score: 'main_' + main_score})
-
-		# Check if pepXML is processed by TPP
-		if 'pep' in psms.columns:
-			tpp = True
-		else:
-			tpp = False
 
 		# Generate theoretical spectra
 		click.echo("Info: Generate theoretical spectra.")
@@ -529,9 +495,9 @@ def conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range,
 		# Round floating numbers
 		peaks = peaks.round(6)
 
-		return psms, peaks, tpp
+		return psms, peaks
 	else:
-		return pd.DataFrame({'run_id': [], 'scan_id': [], 'hit_rank': [], 'massdiff': [], 'precursor_charge': [], 'retention_time': [], 'ion_mobility': [], 'peptide_sequence': [], 'modifications': [], 'nterm_modification': [], 'cterm_modification': [], 'protein_id': [], 'gene_id': [], 'num_tot_proteins': [], 'decoy': []}), pd.DataFrame({'scan_id': [], 'modified_peptide': [], 'precursor_charge': [], 'precursor_mz': [], 'fragment': [], 'product_mz': [], 'intensity': []}), True
+		return pd.DataFrame({'run_id': [], 'scan_id': [], 'hit_rank': [], 'massdiff': [], 'precursor_charge': [], 'retention_time': [], 'ion_mobility': [], 'peptide_sequence': [], 'modifications': [], 'nterm_modification': [], 'cterm_modification': [], 'protein_id': [], 'gene_id': [], 'num_tot_proteins': [], 'decoy': []}), pd.DataFrame({'scan_id': [], 'modified_peptide': [], 'precursor_charge': [], 'precursor_mz': [], 'fragment': [], 'product_mz': [], 'intensity': []})
 
 def basename_spectralfile(spectralfile):
 	'''

diff --git a/easypqp/library.py b/easypqp/library.py
@@ -190,19 +190,19 @@ def lowess(run, reference_run, xcol, ycol, lowess_frac, psm_fdr_threshold, min_p
 
   return run
 
-def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencefile, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_frac, rt_psm_fdr_threshold, im_lowess_frac, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus):
+def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, rt_filter, im_referencefile, im_filter, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_frac, rt_psm_fdr_threshold, im_lowess_frac, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus):
   # Parse input arguments
   psm_files = []
   spectra = []
 
   for file in files:
-    if 'psms' in file:
+    if 'psmpkl' in file:
       psm_files.append(file)
     if 'peakpkl' in file:
       spectra.append(file)
 
   if len(psm_files) == 0:
-    raise click.ClickException("No PSMs files present. Need to have tag 'psms' in filename.")
+    raise click.ClickException("No PSMs files present. Need to have tag 'psmpkl' in filename.")
 
   if len(spectra) == 0:
     raise click.ClickException("No spectrum files present. Need to have tag 'peakpkl' in filename.")
@@ -219,7 +219,7 @@ def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencef
   psms_list = []
   for psm_file in psm_files:
     click.echo("Info: Reading file %s." % psm_file)
-    psm_tab = pd.read_csv(psm_file, index_col=False, sep='\t')
+    psm_tab = pd.read_pickle(psm_file)
     if psm_tab.shape[0] > 0:
       psms_list.append(psm_tab)
   psms = pd.concat(psms_list).reset_index(drop=True)
@@ -253,6 +253,12 @@ def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencef
       # Select reference run
       pepidr_stats = pepidr.groupby('base_name')[['modified_peptide']].count().reset_index()
       click.echo(pepidr_stats)
+
+      if im_filter is not None:
+        click.echo("Info: Filter candidate IM reference runs by tag '%s'." % im_filter)
+        pepidr_stats = pepidr_stats[pepidr_stats['base_name'].str.contains(im_filter)]
+        click.echo(pepidr_stats)
+
       im_reference_run_base_name = pepidr_stats.loc[pepidr_stats['modified_peptide'].idxmax()]['base_name']
 
       im_reference_run = pepidr[pepidr['base_name'] == im_reference_run_base_name].copy()
@@ -272,6 +278,12 @@ def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencef
     # Select reference run
     pepidr_stats = pepidr.groupby('base_name')[['modified_peptide']].count().reset_index()
     click.echo(pepidr_stats)
+
+    if rt_filter is not None:
+      click.echo("Info: Filter candidate RT reference runs by tag '%s'." % rt_filter)
+      pepidr_stats = pepidr_stats[pepidr_stats['base_name'].str.contains(rt_filter)]
+      click.echo(pepidr_stats)
+
     rt_reference_run_base_name = pepidr_stats.loc[pepidr_stats['modified_peptide'].idxmax()]['base_name']
 
     rt_reference_run = pepidr[pepidr['base_name'] == rt_reference_run_base_name].copy()

diff --git a/easypqp/main.py b/easypqp/main.py
@@ -26,9 +26,7 @@ def cli():
 @click.option('--spectra', 'spectralfile', required=True, type=click.Path(exists=True), help='The input mzXML or MGF (timsTOF only) file.')
 @click.option('--unimod', 'unimodfile', required=False, type=click.Path(exists=True), help='The input UniMod XML file.')
 @click.option('--psms', 'psmsfile', required=False, type=click.Path(exists=False), help='Output PSMs file.')
-@click.option('--subpsms', 'subpsmsfile', required=False, type=click.Path(exists=False), help='Output subsampled PSMs file.')
 @click.option('--peaks', 'peaksfile', required=False, type=click.Path(exists=False), help='Output peaks file.')
-@click.option('--main_score', default="var_expectscore", show_default=True, type=str, help='Main score to use for PyProphet.')
 @click.option('--exclude-range', 'exclude_range_str', default="-1.5,3.5", show_default=True, required=False, type=str, help='massdiff in this range will not be mapped to UniMod.')
 @click.option('--max_delta_unimod', default=0.02, show_default=True, type=float, help='Maximum delta mass (Dalton) for UniMod annotation.')
 @click.option('--max_delta_ppm', default=15, show_default=True, type=float, help='Maximum delta mass (PPM) for annotation.')
@@ -37,7 +35,7 @@ def cli():
 @click.option('--enable_specific_losses/--no-enable_specific_losses', default=False, show_default=True, help='Enable specific fragment ion losses.')
 @click.option('--enable_unspecific_losses/--no-enable_unspecific_losses', default=False, show_default=True, help='Enable unspecific fragment ion losses.')
 @click.option('--subsample_fraction', default=1.0, show_default=True, type=float, help='Data fraction used for subsampling.')
-def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfile, main_score, exclude_range_str, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses, subsample_fraction):
+def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, peaksfile, exclude_range_str, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses, subsample_fraction):
     """
     Convert pepXML files for EasyPQP
     """
@@ -47,26 +45,19 @@ def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfi
 
     run_id = basename_spectralfile(spectralfile)
     if psmsfile is None:
-        psmsfile = run_id + "_psms.tsv"
-    if subpsmsfile is None:
-        subpsmsfile = run_id + "_subpsms.tsv"
+        psmsfile = run_id + ".psmpkl"
     if peaksfile is None:
         peaksfile = run_id + ".peakpkl"
 
     temp = exclude_range_str.split(',')
     exclude_range = [float(temp[0]), float(temp[1])]
 
     click.echo("Info: Converting %s." % pepxmlfile)
-    psms, peaks, tpp = conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses)
+    psms, peaks = conversion(pepxmlfile, spectralfile, unimodfile, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses)
 
-    psms.to_csv(psmsfile, sep="\t", index=False)
+    psms.to_pickle(psmsfile)
     click.echo("Info: PSMs successfully converted and stored in %s." % psmsfile)
 
-    if not tpp:
-        subpsms = psms.sample(frac=subsample_fraction)
-        subpsms.to_csv(subpsmsfile, sep="\t", index=False)
-        click.echo("Info: Subsampled PSMs successfully converted and stored in %s." % subpsmsfile)
-
     peaks.to_pickle(peaksfile)
     click.echo("Info: Peaks successfully converted and stored in %s." % peaksfile)
 
@@ -77,7 +68,9 @@ def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfi
 @click.option('--psmtsv', 'psmtsv', required=False, type=click.Path(exists=False), help='psm.tsv file from Philosopher.')
 @click.option('--peptidetsv', 'peptidetsv', required=False, type=click.Path(exists=False), help='peptide.tsv file from Philosopher.')
 @click.option('--rt_reference', 'rt_referencefile', required=False, type=click.Path(exists=True), help='Optional iRT/CiRT reference file.')
+@click.option('--rt_filter', 'rt_filter', required=False, type=str, help='Optional tag to filter candidate RT reference runs.')
 @click.option('--im_reference', 'im_referencefile', required=False, type=click.Path(exists=True), help='Optional IM reference file.')
+@click.option('--im_filter', 'im_filter', required=False, type=str, help='Optional tag to filter candidate IM reference runs.')
 @click.option('--psm_fdr_threshold', default=0.01, show_default=True, type=float, help='PSM FDR threshold.')
 @click.option('--peptide_fdr_threshold', default=0.01, show_default=True, type=float, help='Peptide FDR threshold.')
 @click.option('--protein_fdr_threshold', default=0.01, show_default=True, type=float, help='Protein FDR threshold.')
@@ -91,12 +84,12 @@ def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfi
 @click.option('--min_peptides', default=5, show_default=True, type=int, help='Minimum peptides required for successful alignment.')
 @click.option('--proteotypic/--no-proteotypic', show_default=True, default=True, help='Use only proteotypic, unique, non-shared peptides.')
 @click.option('--consensus/--no-consensus', show_default=True, default=True, help='Generate consensus instead of best replicate spectra.')
-def library(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencefile, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus):
+def library(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, rt_filter, im_referencefile, im_filter, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus):
     """
     Generate EasyPQP library
     """
 
-    generate(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencefile, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus)
+    generate(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, rt_filter, im_referencefile, im_filter, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus)
     click.echo("Info: Library successfully generated.")
 
 # EasyPQP Reduce

diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='easypqp',
-    version='0.1.4',
+    version='0.1.5',
     description='EasyPQP: Simple library generation for OpenSWATH',
     long_description=long_description,
     long_description_content_type='text/markdown',