diff --git a/easypqp/convert.py b/easypqp/convert.py index 94480af..8df0307 100755 --- a/easypqp/convert.py +++ b/easypqp/convert.py @@ -460,7 +460,7 @@ def generate_ionseries(peptide_sequence, precursor_charge, fragment_charges=[1,2 return list(fragments.keys()), np.fromiter(fragments.values(), np.float, len(fragments)) -def conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses): +def conversion(pepxmlfile, spectralfile, unimodfile, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses): # Parse basename base_name = basename_spectralfile(spectralfile) click.echo("Info: Parsing run %s." % base_name) @@ -469,46 +469,12 @@ def conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range, um = unimod(unimodfile, max_delta_unimod) # Parse pepXML + click.echo("Info: Parsing pepXML.") px = pepxml(pepxmlfile, um, base_name, exclude_range) psms = px.get() # Continue if any PSMS are present if psms.shape[0] > 0: - # Generate UniMod peptide sequence - click.echo("Info: Matching modifications to UniMod.") - - # Append PyProphet columns - run_id = basename_spectralfile(spectralfile) - psms['group_id'] = psms['run_id'] + "_" + psms['scan_id'].astype(str) - - if 'var_expect' in psms.columns: - psms = psms.rename(index=str, columns={'var_expect': 'expect'}) - psms['var_expectscore'] = 0.0 - np.log(psms['expect']) - - if 'var_nextscore' in psms.columns and 'var_hyperscore' in psms.columns: - psms = psms.rename(index=str, columns={'var_nextscore': 'nextscore'}) - psms['var_deltascore'] = 1.0 - (psms['nextscore'] / psms['var_hyperscore']) - - # DIA-Umpire quality tiers - if run_id.endswith("_Q1"): - psms['quality'] = 1 - elif run_id.endswith("_Q2"): - psms['quality'] = 2 - elif run_id.endswith("_Q3"): - psms['quality'] = 3 - else: # DDA data - psms['quality'] = 0 - - if main_score not in psms.columns: - raise click.ClickException("Error: Main score '%s' is not present in pepXML." % main_score) - - psms = psms.rename(index=str, columns={main_score: 'main_' + main_score}) - - # Check if pepXML is processed by TPP - if 'pep' in psms.columns: - tpp = True - else: - tpp = False # Generate theoretical spectra click.echo("Info: Generate theoretical spectra.") @@ -529,9 +495,9 @@ def conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range, # Round floating numbers peaks = peaks.round(6) - return psms, peaks, tpp + return psms, peaks else: - return pd.DataFrame({'run_id': [], 'scan_id': [], 'hit_rank': [], 'massdiff': [], 'precursor_charge': [], 'retention_time': [], 'ion_mobility': [], 'peptide_sequence': [], 'modifications': [], 'nterm_modification': [], 'cterm_modification': [], 'protein_id': [], 'gene_id': [], 'num_tot_proteins': [], 'decoy': []}), pd.DataFrame({'scan_id': [], 'modified_peptide': [], 'precursor_charge': [], 'precursor_mz': [], 'fragment': [], 'product_mz': [], 'intensity': []}), True + return pd.DataFrame({'run_id': [], 'scan_id': [], 'hit_rank': [], 'massdiff': [], 'precursor_charge': [], 'retention_time': [], 'ion_mobility': [], 'peptide_sequence': [], 'modifications': [], 'nterm_modification': [], 'cterm_modification': [], 'protein_id': [], 'gene_id': [], 'num_tot_proteins': [], 'decoy': []}), pd.DataFrame({'scan_id': [], 'modified_peptide': [], 'precursor_charge': [], 'precursor_mz': [], 'fragment': [], 'product_mz': [], 'intensity': []}) def basename_spectralfile(spectralfile): ''' diff --git a/easypqp/library.py b/easypqp/library.py index 1a79a1b..1297b45 100644 --- a/easypqp/library.py +++ b/easypqp/library.py @@ -190,19 +190,19 @@ def lowess(run, reference_run, xcol, ycol, lowess_frac, psm_fdr_threshold, min_p return run -def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencefile, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_frac, rt_psm_fdr_threshold, im_lowess_frac, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus): +def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, rt_filter, im_referencefile, im_filter, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_frac, rt_psm_fdr_threshold, im_lowess_frac, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus): # Parse input arguments psm_files = [] spectra = [] for file in files: - if 'psms' in file: + if 'psmpkl' in file: psm_files.append(file) if 'peakpkl' in file: spectra.append(file) if len(psm_files) == 0: - raise click.ClickException("No PSMs files present. Need to have tag 'psms' in filename.") + raise click.ClickException("No PSMs files present. Need to have tag 'psmpkl' in filename.") if len(spectra) == 0: raise click.ClickException("No spectrum files present. Need to have tag 'peakpkl' in filename.") @@ -219,7 +219,7 @@ def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencef psms_list = [] for psm_file in psm_files: click.echo("Info: Reading file %s." % psm_file) - psm_tab = pd.read_csv(psm_file, index_col=False, sep='\t') + psm_tab = pd.read_pickle(psm_file) if psm_tab.shape[0] > 0: psms_list.append(psm_tab) psms = pd.concat(psms_list).reset_index(drop=True) @@ -253,6 +253,12 @@ def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencef # Select reference run pepidr_stats = pepidr.groupby('base_name')[['modified_peptide']].count().reset_index() click.echo(pepidr_stats) + + if im_filter is not None: + click.echo("Info: Filter candidate IM reference runs by tag '%s'." % im_filter) + pepidr_stats = pepidr_stats[pepidr_stats['base_name'].str.contains(im_filter)] + click.echo(pepidr_stats) + im_reference_run_base_name = pepidr_stats.loc[pepidr_stats['modified_peptide'].idxmax()]['base_name'] im_reference_run = pepidr[pepidr['base_name'] == im_reference_run_base_name].copy() @@ -272,6 +278,12 @@ def generate(files, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencef # Select reference run pepidr_stats = pepidr.groupby('base_name')[['modified_peptide']].count().reset_index() click.echo(pepidr_stats) + + if rt_filter is not None: + click.echo("Info: Filter candidate RT reference runs by tag '%s'." % rt_filter) + pepidr_stats = pepidr_stats[pepidr_stats['base_name'].str.contains(rt_filter)] + click.echo(pepidr_stats) + rt_reference_run_base_name = pepidr_stats.loc[pepidr_stats['modified_peptide'].idxmax()]['base_name'] rt_reference_run = pepidr[pepidr['base_name'] == rt_reference_run_base_name].copy() diff --git a/easypqp/main.py b/easypqp/main.py index 7456cb8..ccd8bce 100644 --- a/easypqp/main.py +++ b/easypqp/main.py @@ -26,9 +26,7 @@ def cli(): @click.option('--spectra', 'spectralfile', required=True, type=click.Path(exists=True), help='The input mzXML or MGF (timsTOF only) file.') @click.option('--unimod', 'unimodfile', required=False, type=click.Path(exists=True), help='The input UniMod XML file.') @click.option('--psms', 'psmsfile', required=False, type=click.Path(exists=False), help='Output PSMs file.') -@click.option('--subpsms', 'subpsmsfile', required=False, type=click.Path(exists=False), help='Output subsampled PSMs file.') @click.option('--peaks', 'peaksfile', required=False, type=click.Path(exists=False), help='Output peaks file.') -@click.option('--main_score', default="var_expectscore", show_default=True, type=str, help='Main score to use for PyProphet.') @click.option('--exclude-range', 'exclude_range_str', default="-1.5,3.5", show_default=True, required=False, type=str, help='massdiff in this range will not be mapped to UniMod.') @click.option('--max_delta_unimod', default=0.02, show_default=True, type=float, help='Maximum delta mass (Dalton) for UniMod annotation.') @click.option('--max_delta_ppm', default=15, show_default=True, type=float, help='Maximum delta mass (PPM) for annotation.') @@ -37,7 +35,7 @@ def cli(): @click.option('--enable_specific_losses/--no-enable_specific_losses', default=False, show_default=True, help='Enable specific fragment ion losses.') @click.option('--enable_unspecific_losses/--no-enable_unspecific_losses', default=False, show_default=True, help='Enable unspecific fragment ion losses.') @click.option('--subsample_fraction', default=1.0, show_default=True, type=float, help='Data fraction used for subsampling.') -def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfile, main_score, exclude_range_str, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses, subsample_fraction): +def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, peaksfile, exclude_range_str, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses, subsample_fraction): """ Convert pepXML files for EasyPQP """ @@ -47,9 +45,7 @@ def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfi run_id = basename_spectralfile(spectralfile) if psmsfile is None: - psmsfile = run_id + "_psms.tsv" - if subpsmsfile is None: - subpsmsfile = run_id + "_subpsms.tsv" + psmsfile = run_id + ".psmpkl" if peaksfile is None: peaksfile = run_id + ".peakpkl" @@ -57,16 +53,11 @@ def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfi exclude_range = [float(temp[0]), float(temp[1])] click.echo("Info: Converting %s." % pepxmlfile) - psms, peaks, tpp = conversion(pepxmlfile, spectralfile, unimodfile, main_score, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses) + psms, peaks = conversion(pepxmlfile, spectralfile, unimodfile, exclude_range, max_delta_unimod, max_delta_ppm, fragment_types, fragment_charges, enable_specific_losses, enable_unspecific_losses) - psms.to_csv(psmsfile, sep="\t", index=False) + psms.to_pickle(psmsfile) click.echo("Info: PSMs successfully converted and stored in %s." % psmsfile) - if not tpp: - subpsms = psms.sample(frac=subsample_fraction) - subpsms.to_csv(subpsmsfile, sep="\t", index=False) - click.echo("Info: Subsampled PSMs successfully converted and stored in %s." % subpsmsfile) - peaks.to_pickle(peaksfile) click.echo("Info: Peaks successfully converted and stored in %s." % peaksfile) @@ -77,7 +68,9 @@ def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfi @click.option('--psmtsv', 'psmtsv', required=False, type=click.Path(exists=False), help='psm.tsv file from Philosopher.') @click.option('--peptidetsv', 'peptidetsv', required=False, type=click.Path(exists=False), help='peptide.tsv file from Philosopher.') @click.option('--rt_reference', 'rt_referencefile', required=False, type=click.Path(exists=True), help='Optional iRT/CiRT reference file.') +@click.option('--rt_filter', 'rt_filter', required=False, type=str, help='Optional tag to filter candidate RT reference runs.') @click.option('--im_reference', 'im_referencefile', required=False, type=click.Path(exists=True), help='Optional IM reference file.') +@click.option('--im_filter', 'im_filter', required=False, type=str, help='Optional tag to filter candidate IM reference runs.') @click.option('--psm_fdr_threshold', default=0.01, show_default=True, type=float, help='PSM FDR threshold.') @click.option('--peptide_fdr_threshold', default=0.01, show_default=True, type=float, help='Peptide FDR threshold.') @click.option('--protein_fdr_threshold', default=0.01, show_default=True, type=float, help='Protein FDR threshold.') @@ -91,12 +84,12 @@ def convert(pepxmlfile, spectralfile, unimodfile, psmsfile, subpsmsfile, peaksfi @click.option('--min_peptides', default=5, show_default=True, type=int, help='Minimum peptides required for successful alignment.') @click.option('--proteotypic/--no-proteotypic', show_default=True, default=True, help='Use only proteotypic, unique, non-shared peptides.') @click.option('--consensus/--no-consensus', show_default=True, default=True, help='Generate consensus instead of best replicate spectra.') -def library(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencefile, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus): +def library(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, rt_filter, im_referencefile, im_filter, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus): """ Generate EasyPQP library """ - generate(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, im_referencefile, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus) + generate(infiles, outfile, psmtsv, peptidetsv, rt_referencefile, rt_filter, im_referencefile, im_filter, psm_fdr_threshold, peptide_fdr_threshold, protein_fdr_threshold, rt_lowess_fraction, rt_psm_fdr_threshold, im_lowess_fraction, im_psm_fdr_threshold, pi0_lambda, peptide_plot_path, protein_plot_path, min_peptides, proteotypic, consensus) click.echo("Info: Library successfully generated.") # EasyPQP Reduce diff --git a/setup.py b/setup.py index 0066e7a..42fdba9 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='easypqp', - version='0.1.4', + version='0.1.5', description='EasyPQP: Simple library generation for OpenSWATH', long_description=long_description, long_description_content_type='text/markdown',