Skip to content

Commit

Permalink
Add GUI support for new version
Browse files Browse the repository at this point in the history
  • Loading branch information
RobbinBouwmeester committed May 8, 2023
1 parent 2b4057f commit d5a9168
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 71 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [2.1.0] - 2023-05-08

### Changed
- Support for GUI and PSM utils
- Support for GUI and transfer learning

## [2.0.4] - 2023-05-06

### Changed
Expand Down
101 changes: 65 additions & 36 deletions deeplc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
from deeplc._argument_parser import parse_arguments
from deeplc._exceptions import DeepLCError

from psm_utils.io.peptide_record import peprec_to_proforma
from psm_utils.psm import PSM
from psm_utils.psm_list import PSMList
from psm_utils.io import read_file
from psm_utils.io import write_file

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -72,7 +77,6 @@ def run(
file_pred,
file_cal=None,
file_pred_out=None,
plot_predictions=False,
file_model=None,
pygam_calibration=True,
split_cal=50,
Expand All @@ -81,13 +85,19 @@ def run(
write_library=False,
batch_num=50000,
n_threads=None,
transfer_learning=False,
log_level="info",
verbose=True,
):
"""Run DeepLC."""
logger.info("Using DeepLC version %s", __version__)
logger.debug("Using %i CPU threads", n_threads)

df_pred = False
df_cal = False
first_line_pred = ""
first_line_cal = ""

if not file_cal and file_model != None:
fm_dict = {}
sel_group = ""
Expand All @@ -100,19 +110,49 @@ def run(
if m_group == sel_group:
fm_dict[m_group] = fm
file_model = fm_dict

# Read input files
df_pred = pd.read_csv(file_pred)
if len(df_pred.columns) < 2:
df_pred = pd.read_csv(file_pred,sep=" ")
df_pred = df_pred.fillna("")


with open(file_pred) as f:
first_line_pred = f.readline()
if file_cal:
with open(file_cal) as f:
first_line_cal = f.readline()

if "modifications" in first_line_pred.split(",") and "seq" in first_line_pred.split(","):
# Read input files
df_pred = pd.read_csv(file_pred)
if len(df_pred.columns) < 2:
df_pred = pd.read_csv(file_pred,sep=" ")
df_pred = df_pred.fillna("")
file_pred = ""

list_of_psms = []
for seq,mod,ident in zip(df_pred["seq"],df_pred["modifications"],df_pred.index):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
psm_list_pred = PSMList(psm_list=list_of_psms)
df_pred = None
else:
psm_list_pred = read_file(file_pred)
if "msms" in file_pred and ".txt" in file_pred:
mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict()
psm_list_pred.rename_modifications(mapper)

if "modifications" in first_line_cal.split(",") and "seq" in first_line_cal.split(",") and file_cal:
df_cal = pd.read_csv(file_cal)
if len(df_cal.columns) < 2:
df_cal = pd.read_csv(df_cal,sep=" ")
df_cal = df_cal.fillna("")

file_cal = ""

list_of_psms = []
for seq,mod,ident,tr in zip(df_cal["seq"],df_cal["modifications"],df_cal.index,df_cal["tr"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
psm_list_cal = PSMList(psm_list=list_of_psms)
df_cal = None
elif file_cal:
psm_list_cal = read_file(file_cal)
if "msms" in file_cal and ".txt" in file_cal:
mapper = pd.read_csv(os.path.join(os.path.dirname(os.path.realpath(__file__)), "unimod/map_mq_file.csv"),index_col=0)["value"].to_dict()
psm_list_cal.rename_modifications(mapper)
# Make a feature extraction object; you can skip this if you do not want to
# use the default settings for DeepLC. Here we want to use a model that does
# not use RDKit features so we skip the chemical descriptor making
Expand All @@ -121,7 +161,7 @@ def run(
cnn_feats=True,
verbose=verbose
)

# Make the DeepLC object that will handle making predictions and calibration
dlc = DeepLC(
path_model=file_model,
Expand All @@ -134,42 +174,31 @@ def run(
batch_num=batch_num,
n_jobs=n_threads,
verbose=verbose,
deeplc_retrain=transfer_learning
)


# Calibrate the original model based on the new retention times
if file_cal:
if len(psm_list_cal) > 0:
logger.info("Selecting best model and calibrating predictions...")
dlc.calibrate_preds(seq_df=df_cal)
print(psm_list_cal)
dlc.calibrate_preds(psm_list=psm_list_cal)

# Make predictions; calibrated or uncalibrated
logger.info("Making predictions using model: %s", dlc.model)
if file_cal:
preds = dlc.make_preds(seq_df=df_pred)
preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred)
else:
preds = dlc.make_preds(seq_df=df_pred, calibrate=False)

df_pred["predicted_tr"] = preds
preds = dlc.make_preds(seq_df=df_pred, infile=file_pred, psm_list=psm_list_pred, calibrate=False)
#df_pred["predicted_tr"] = preds
logger.info("Writing predictions to file: %s", file_pred_out)
df_pred.to_csv(file_pred_out)

if plot_predictions:
if file_cal and "tr" in df_pred.columns:
file_pred_figure = os.path.splitext(file_pred_out)[0] + '.png'
logger.info(
"Saving scatterplot of predictions to file: %s",
file_pred_figure
)
plt.figure(figsize=(11.5, 9))
plt.scatter(df_pred["tr"], df_pred["predicted_tr"], s=3)
plt.title("DeepLC predictions")
plt.xlabel("Observed retention times")
plt.ylabel("Predicted retention times")
plt.savefig(file_pred_figure, dpi=300)
else:
logger.warning(
"No observed retention time in input data. Cannot plot "
"predictions."
)

file_pred_out = open(file_pred_out,"w")
file_pred_out.write("Sequence proforma,predicted retention time\n")
for psm,tr in zip(psm_list_pred,preds):
file_pred_out.write(f"{psm.peptidoform.proforma},{tr}\n")
file_pred_out.close()

logger.info("DeepLC finished!")

Expand Down
30 changes: 20 additions & 10 deletions deeplc/_argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ def parse_arguments(gui=False):
"gooey_options": {"checkbox_label": "Use pyGAM calibration"},
"metavar": "Use pyGAM calibration"
},
"legacy_calibration": {
"gooey_options": {"checkbox_label": "Use legacy calibration"},
"metavar": "Use legacy calibration"
"transfer_learning": {
"gooey_options": {"checkbox_label": "Use transfer learning calibration"},
"metavar": "Use transfer learning calibration"
},
"split_cal": {"gooey_options": {"visible": False}},
"dict_divider": {"gooey_options": {"visible": False}},
Expand Down Expand Up @@ -116,13 +116,6 @@ def parse_arguments(gui=False):
help="path to write output file with predictions",
**gooey_args["file_pred_out"]
)
io_args.add_argument(
"--plot_predictions",
action='store_true',
default=False,
help="save scatter plot of predictions vs observations",
**gooey_args["plot_predictions"]
)

model_cal_args = parser.add_argument_group(
"Model and calibration",
Expand Down Expand Up @@ -153,13 +146,24 @@ def parse_arguments(gui=False):
),
**gooey_args["pygam_calibration"]
)

calibration_group.add_argument(
"--transfer_learning",
dest="transfer_learning",
action="store_false",
help="use transfer learning as calibration method",
**gooey_args["transfer_learning"]
)

"""
calibration_group.add_argument(
"--legacy_calibration",
dest="pygam_calibration",
action="store_false",
help="use legacy simple piecewise linear fit as calibration method",
**gooey_args["legacy_calibration"]
)
"""

model_cal_args.add_argument(
"--split_cal",
Expand Down Expand Up @@ -190,6 +194,7 @@ def parse_arguments(gui=False):
advanced_args = parser.add_argument_group(
"Advanced configuration", **gooey_args["advanced_args"]
)
"""
advanced_args.add_argument(
"--use_library",
dest="use_library",
Expand All @@ -202,6 +207,9 @@ def parse_arguments(gui=False):
),
**gooey_args["use_library"]
)
"""

"""
advanced_args.add_argument(
"--write_library",
dest="write_library",
Expand All @@ -210,6 +218,8 @@ def parse_arguments(gui=False):
help="append new predictions to library for faster future results",
**gooey_args["write_library"]
)
"""

advanced_args.add_argument(
"--batch_num",
type=int,
Expand Down
40 changes: 16 additions & 24 deletions deeplc/deeplc.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,11 +326,11 @@ def do_f_extraction(self, seqs, mods, identifiers, charges=[]):
"""
list_of_psms = []
if len(charges) > 0:
for seq,mod,id in zip(seqs,mods,identifiers):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id))
for seq,mod,ident in zip(seqs,mods,identifiers):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
else:
for seq,mod,id,z in zip(seqs,mods,identifiers,charges):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id))
for seq,mod,ident,z in zip(seqs,mods,identifiers,charges):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))

psm_list = PSMList(psm_list=list_of_psms)

Expand Down Expand Up @@ -358,11 +358,11 @@ def do_f_extraction_pd(self,

list_of_psms = []
if len(charges) == 0:
for seq,mod,id in zip(df_instances["seq"],df_instances["modifications"],df_instances.index):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id))
for seq,mod,ident in zip(df_instances["seq"],df_instances["modifications"],df_instances.index):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
else:
for seq,mod,id,z in zip(df_instances["seq"],df_instances["modifications"],df_instances.index,charges=df_instances["charges"]):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=id))
for seq,mod,ident,z in zip(df_instances["seq"],df_instances["modifications"],df_instances.index,charges=df_instances["charges"]):
list_of_psms.append(PSM(peptide=peprec_to_proforma(seq,mod),spectrum_id=ident))
psm_list = PSMList(psm_list=list_of_psms)

return self.f_extractor.full_feat_extract(psm_list)
Expand Down Expand Up @@ -680,8 +680,8 @@ def make_preds(self,
"""
if type(seq_df) == pd.core.frame.DataFrame:
list_of_psms = []
for seq,mod,id in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id))
for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
psm_list = PSMList(psm_list=list_of_psms)

if len(infile) > 0:
Expand All @@ -695,15 +695,7 @@ def make_preds(self,
logger.debug("Extracting features for the CNN model ...")

X = self.do_f_extraction_psm_list_parallel(psm_list)
#X = self.do_f_extraction_psm_list(psm_list)

X_sum = np.stack(X["matrix_sum"].values())
#print(np.stack(X["matrix_all"].values()))
#print(X["matrix_all"].values())
#input("s")
#print(X["pos_matrix"].values())
#print(np.stack(X["pos_matrix"].values()))
#print("s2")
X_global = np.concatenate((np.stack(X["matrix_all"].values()),
np.stack(X["pos_matrix"].values())),
axis=1)
Expand Down Expand Up @@ -760,8 +752,8 @@ def calibrate_preds_func_pygam(self,
# TODO make sure either psm_list or seq_df is supplied
if type(seq_df) == pd.core.frame.DataFrame:
list_of_psms = []
for seq,mod,id,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id,retention_time=tr))
for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
psm_list = PSMList(psm_list=list_of_psms)

measured_tr = [psm.retention_time for psm in psm_list]
Expand Down Expand Up @@ -837,8 +829,8 @@ def calibrate_preds_func(self,
"""
if type(seq_df) == pd.core.frame.DataFrame:
list_of_psms = []
for seq,mod,id in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id))
for seq,mod,ident in zip(seq_df["seq"],seq_df["modifications"],seq_df.index):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident))
psm_list = PSMList(psm_list=list_of_psms)

predicted_tr = self.make_preds(
Expand Down Expand Up @@ -982,8 +974,8 @@ def calibrate_preds(self,
"""
if type(seq_df) == pd.core.frame.DataFrame:
list_of_psms = []
for seq,mod,id,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=id,retention_time=tr))
for seq,mod,ident,tr in zip(seq_df["seq"],seq_df["modifications"],seq_df.index,seq_df["tr"]):
list_of_psms.append(PSM(peptidoform=peprec_to_proforma(seq,mod),spectrum_id=ident,retention_time=tr))
psm_list = PSMList(psm_list=list_of_psms)


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

setup(
name='deeplc',
version='2.0.4',
version='2.1.0',
license='apache-2.0',
description='DeepLC: Retention time prediction for (modified) peptides using Deep Learning.',
long_description=LONG_DESCRIPTION,
Expand Down

0 comments on commit d5a9168

Please sign in to comment.