diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py
index df7f0cb..a8f4200 100644
--- a/sdrf_pipelines/maxquant/maxquant.py
+++ b/sdrf_pipelines/maxquant/maxquant.py
@@ -11,6 +11,7 @@
 from xml.dom.minidom import parse
 from xml.dom.minidom import Document
 from datetime import datetime
+import yaml
 import os
 import numpy as np
 import pkg_resources
@@ -22,6 +23,8 @@ def __init__(self) -> None:
         super().__init__()
         self.warnings = dict()
         self.modfile = pkg_resources.resource_filename(__name__, "modifications.xml")
+        self.datparamfile = pkg_resources.resource_filename(__name__, "param2sdrf.yml")
+        print(self.datparamfile)
 
     def create_new_mods(self, mods, mqconfdir):
         i = 0
@@ -645,6 +648,14 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
         sdrf = sdrf.astype(str)
         sdrf.columns = map(str.lower, sdrf.columns)  # convert column names to lower-case
 
+        with open(self.datparamfile) as file:
+            param_mapping = yaml.safe_load(file)
+            mapping = param_mapping["parameters"]
+        datparams = dict()
+        for i in mapping:
+            datparams[i["sdrf"]] = i["name"]
+
+
         # map filename to tuple of [fixed, variable] mods
         mod_cols = [c for ind, c in enumerate(sdrf) if
                     c.startswith('comment[modification parameters')]  # columns with modification parameters
@@ -670,6 +681,11 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
         source_name2n_reps = dict()
         file2technical_rep = dict()
         file2instrument = dict()
+        # New parameters from extended SDRF (including data analysis parameters)
+        file2params = dict()
+        for p in datparams.values():
+            file2params[p] = dict()
+
         if mqconfdir:
             self.create_new_mods(sdrf[mod_cols], mqconfdir)
 
@@ -760,6 +776,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
                 file2fragtol[raw] = "20"
                 file2fragtolunit[raw] = "ppm"
 
+            # STILL NOT IMPLEMENTED!
             if 'comment[dissociation method]' in row:
                 if row['comment[dissociation method]'] == 'not available':
                     file2diss[raw] = 'HCD'
@@ -999,6 +1016,15 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
             file2label[raw] = lt
 
+            # Reading data analysis parameters
+            for p in datparams.keys():
+                comment_p = 'comment[' + p + ']'
+                if comment_p in row:
+                    file2params[datparams[p]][raw] = row[comment_p]
+
+
+
+
         # create maxquant parameters xml file
 
         doc = Document()
@@ -1144,7 +1170,14 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
         root.appendChild(secondPeptide)
 
         matchBetweenRuns_node = doc.createElement('matchBetweenRuns')
-        matchBetweenRuns_node.appendChild(doc.createTextNode(matchBetweenRuns))
+        if (len(file2params["match_between_runs"]) > 0):
+            first = list(file2params["match_between_runs"].values())[0]
+            matchBetweenRuns_node.appendChild(doc.createTextNode(first))
+            if(len(set(file2params["match_between_runs"].values())) > 1):
+                warning_message = "multiple values for match between runs, taking the first: " + first
+                self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1
+        else:
+            matchBetweenRuns_node.appendChild(doc.createTextNode(matchBetweenRuns))
         root.appendChild(matchBetweenRuns_node)
 
         matchUnidentifiedFeatures = doc.createElement('matchUnidentifiedFeatures')
@@ -1259,7 +1292,15 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
         root.appendChild(intensityPredictionsFile)
 
         minPepLen = doc.createElement('minPepLen')
-        minPepLen.appendChild(doc.createTextNode('7'))
+        tparam = file2params["min_peptide_length"]
+        if (len(tparam) > 0):
+            first = list(tparam.values())[0]
+            minPepLen.appendChild(doc.createTextNode(first))
+            if(len(set(tparam.values())) > 1):
+                warning_message = "multiple values for parameter minimum peptide length, taking the first: " + first
+                self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1
+        else:
+            minPepLen.appendChild(doc.createTextNode('7'))
         root.appendChild(minPepLen)
 
         psmFdrCrosslink = doc.createElement('psmFdrCrosslink')
@@ -1267,15 +1308,45 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
         root.appendChild(psmFdrCrosslink)
 
         peptideFdr = doc.createElement('peptideFdr')
-        peptideFdr.appendChild(doc.createTextNode(str(peptideFDR)))
+        tparam = file2params["fdr_peptide"]
+        if (len(tparam) > 0):
+            first = list(tparam.values())[0]
+            warning_message = "overwriting peptide FDR using the value in the sdrf file"
+            self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1
+            peptideFdr.appendChild(doc.createTextNode(first))
+            if(len(set(tparam.values())) > 1):
+                warning_message = "multiple values for parameter Peptide FDR, taking the first: " + first
+                self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1
+        else:
+            peptideFdr.appendChild(doc.createTextNode(str(peptideFDR)))
         root.appendChild(peptideFdr)
 
         proteinFdr = doc.createElement('proteinFdr')
-        proteinFdr.appendChild(doc.createTextNode(str(proteinFDR)))
+        tparam = file2params["fdr_protein"]
+        if (len(tparam) > 0):
+            first = list(tparam.values())[0]
+            warning_message = "overwriting protein FDR using the value in the sdrf file"
+            self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1
+            proteinFdr.appendChild(doc.createTextNode(first))
+            if(len(set(tparam.values())) > 1):
+                warning_message = "multiple values for parameter Protein FDR, taking the first: " + first
+                self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1
+        else:
+            proteinFdr.appendChild(doc.createTextNode(str(proteinFDR)))
         root.appendChild(proteinFdr)
 
         siteFdr = doc.createElement('siteFdr')
-        siteFdr.appendChild(doc.createTextNode('0.01'))
+        tparam = file2params["fdr_psm"]
+        if (len(tparam) > 0):
+            first = list(tparam.values())[0]
+            warning_message = "overwriting PSM FDR using the value in the sdrf file"
+            self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1
+            siteFdr.appendChild(doc.createTextNode(first))
+            if(len(set(tparam.values())) > 1):
+                warning_message = "multiple values for parameter PSM FDR, taking the first: " + first
+                self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1
+        else:
+            siteFdr.appendChild(doc.createTextNode('0.01'))
         root.appendChild(siteFdr)
 
         minPeptideLengthForUnspecificSearch = doc.createElement('minPeptideLengthForUnspecificSearch')
@@ -1291,7 +1362,15 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
         root.appendChild(useNormRatiosForOccupancy)
 
         minPeptides = doc.createElement('minPeptides')
-        minPeptides.appendChild(doc.createTextNode('1'))
+        tparam = file2params["min_num_peptides"]
+        if (len(tparam) > 0):
+            first = list(tparam.values())[0]
+            minPeptides.appendChild(doc.createTextNode(first))
+            if(len(set(tparam.values())) > 1):
+                warning_message = "multiple values for parameter minimum number of peptides, taking the first: " + first
+                self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1
+        else:
+            minPeptides.appendChild(doc.createTextNode('1'))
         root.appendChild(minPeptides)
 
         minRazorPeptides = doc.createElement('minRazorPeptides')
@@ -1362,8 +1441,23 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
         compositionPrediction.appendChild(doc.createTextNode('0'))
         root.appendChild(compositionPrediction)
 
+
         quantMode = doc.createElement('quantMode')
-        quantMode.appendChild(doc.createTextNode('1'))
+        tparam = file2params["quantification_method"]
+        if (len(tparam) > 0):
+            first = list(tparam.values())[0]
+            if first == "unique":
+                first = '2'
+            elif first == "shared":
+                first = '0'
+            else:
+                first = '1'
+            quantMode.appendChild(doc.createTextNode(first))
+            if(len(set(tparam.values())) > 1):
+                warning_message = "multiple values for parameter Quantification mode, taking the first: " + first
+                self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1
+        else:
+            quantMode.appendChild(doc.createTextNode('1'))
         root.appendChild(quantMode)
 
         massDifferenceMods = doc.createElement('massDifferenceMods')
@@ -1536,22 +1630,22 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
         referenceChannel = doc.createElement('referenceChannel')
 
         for key1, instr_val in file2instrument.items():
-            value2 = str(file2enzyme[key1]) + file2label[key1] + str(file2mods[key1]) + str(file2pctol) \
-                + str(file2fragtol)
+            value2 = str(file2enzyme[key1]) + file2label[key1] + str(file2mods[key1]) + str(file2pctol) + str(file2fragtol)
+            datanalysisparams = dict()
+            for p in file2params.keys():
+                if (len(file2params[p]) > 0):
+                    datanalysisparams[p] = file2params[p][key1]
+
             if tag == 0 and tmp == []:
                 int_node = doc.createElement('int')
                 int_node.appendChild(doc.createTextNode('0'))
                 tmp.append({instr_val: value2})
+                parameterGroup['0'] = {'instrument': file2instrument[key1], 'label': file2label[key1] , 'mods': file2mods[key1], 'enzyme': file2enzyme[key1], 'pctol': file2pctol[key1],
+                                       'fragtol': file2fragtol[key1],'pctolunit': file2pctolunit[key1], 'fragtolunit': file2fragtolunit[key1], 'datanalysisparams': datanalysisparams }
                 if 'Lys8' in file2label[key1] or 'Arg10' in file2label[key1] or 'Arg6' in \
-                        file2label[key1] or 'Lys6' in file2label[key1]:
-                    parameterGroup['0'] = [file2instrument[key1], file2label[key1], file2mods[key1], file2enzyme[key1],
-                                           file2silac_shape[key1], file2pctol[key1], file2fragtol[key1],
-                                           file2pctolunit[key1], file2fragtolunit[key1]]
-                else:
-                    parameterGroup['0'] = [file2instrument[key1], file2label[key1], file2mods[key1], file2enzyme[key1],
-                                           file2pctol[key1], file2fragtol[key1], file2pctolunit[key1],
-                                           [file2fragtolunit[key1]]]
+                        file2label[key1] or 'Lys6' in file2label[key1]:
+                    parameterGroup['0']['silac_shape'] = file2silac_shape[key1]
 
             elif {instr_val: value2} in tmp:
                 int_node = doc.createElement('int')
@@ -1564,16 +1658,13 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
                 int_text = doc.createTextNode(str(tag))
                 int_node.appendChild(int_text)
                 tmp.append({instr_val: value2})
+                parameterGroup[str(tag)] = {'instrument': file2instrument[key1], 'label': file2label[key1] , 'mods': file2mods[key1], 'enzyme': file2enzyme[key1],
+                                            'pctol': file2pctol[key1], 'fragtol': file2fragtol[key1],'pctolunit': file2pctolunit[key1], 'fragtolunit': file2fragtolunit[key1], 'datanalysisparams': datanalysisparams}
+
                 if 'Lys8' in file2label[key1] or 'Arg10' in file2label[key1] or 'Arg6' in file2label[key1] \
                         or 'Lys6' in file2label[key1]:
-                    parameterGroup[str(tag)] = [file2instrument[key1], file2label[key1], file2mods[key1],
-                                                file2enzyme[key1], file2silac_shape[key1],
-                                                file2pctol[key1], file2fragtol[key1], file2pctolunit[key1],
-                                                file2fragtolunit[key1]]
-                else:
-                    parameterGroup[str(tag)] = [file2instrument[key1], file2label[key1], file2mods[key1],
-                                                file2enzyme[key1], file2pctol[key1], file2fragtol[key1],
-                                                file2pctolunit[key1], file2fragtolunit[key1]]
+                    parameterGroup[str(tag)]['silac_shape'] = file2silac_shape[key1]
+
 
             paramGroupIndices.appendChild(int_node)
@@ -1600,7 +1691,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
         for i, j in parameterGroup.items():
             parameterGroup = doc.createElement('parameterGroup')
             msInstrument = doc.createElement('msInstrument')
-            if 'Bruker Q-TOF' == j[0]:
+            if 'Bruker Q-TOF' == j['instrument']:
                 msInstrument.appendChild(doc.createTextNode('1'))
                 maxCharge = doc.createElement('maxCharge')
                 maxCharge.appendChild(doc.createTextNode('5'))
@@ -1624,7 +1715,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
                 advancedPeakSplitting.appendChild(doc.createTextNode('True'))
                 intensityThreshold = doc.createElement('intensityThreshold')
                 intensityThreshold.appendChild(doc.createTextNode('30'))
-            elif 'AB Sciex Q-TOF' == j[0]:
+            elif 'AB Sciex Q-TOF' == j["instrument"]:
                 msInstrument.appendChild(doc.createTextNode('2'))
                 maxCharge = doc.createElement('maxCharge')
                 maxCharge.appendChild(doc.createTextNode('5'))
@@ -1648,7 +1739,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
                 advancedPeakSplitting.appendChild(doc.createTextNode('True'))
                 intensityThreshold = doc.createElement('intensityThreshold')
                 intensityThreshold.appendChild(doc.createTextNode('0'))
-            elif 'Agilent Q-TOF' == j[0]:
+            elif 'Agilent Q-TOF' == j["instrument"]:
                 msInstrument.appendChild(doc.createTextNode('3'))
                 maxCharge = doc.createElement('maxCharge')
                 maxCharge.appendChild(doc.createTextNode('5'))
@@ -1672,7 +1763,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
                 advancedPeakSplitting.appendChild(doc.createTextNode('True'))
                 intensityThreshold = doc.createElement('intensityThreshold')
                 intensityThreshold.appendChild(doc.createTextNode('0'))
-            elif 'Bruker TIMS' == j[0]:
+            elif 'Bruker TIMS' == j["instrument"]:
                 msInstrument.appendChild(doc.createTextNode('4'))
                 maxCharge = doc.createElement('maxCharge')
                 maxCharge.appendChild(doc.createTextNode('4'))
@@ -1738,9 +1829,9 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
             isotopeValleyFactor = doc.createElement('isotopeValleyFactor')
             isotopeValleyFactor.appendChild(doc.createTextNode('1.2'))
             labelMods = doc.createElement('labelMods')
-            if 'Lys8' in j[1] or 'Arg10' in j[1] or 'Arg6' in j[1] or 'Lys6' in j[1]:
-                for lm in range(j[4][0]):
-                    r = j[1].split(',')[lm * j[4][1]:lm * j[4][1] + lm * j[4][1]]
+            if 'Lys8' in j["label"] or 'Arg10' in j["label"] or 'Arg6' in j["label"] or 'Lys6' in j["label"]:
+                for lm in range(j["silac_shape"][0]):
+                    r = j["label"].split(',')[lm * j["silac_shape"][1]:lm * j["silac_shape"][1] + lm * j["silac_shape"][1]]
                     if 'Arg0' in r:
                         r.remove('Arg0')
                     text = ';'.join(r)
@@ -1753,7 +1844,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
                 string.appendChild(doc.createTextNode(''))
                 labelMods.appendChild(string)
             lcmsRunType = doc.createElement('lcmsRunType')
-            if "TMT" in j[1] or 'iTRAQ' in j[1]:
+            if "TMT" in j["label"] or 'iTRAQ' in j["label"]:
                 lcmsRunType.appendChild(doc.createTextNode('Reporter ion MS2'))
             else:
                 lcmsRunType.appendChild(doc.createTextNode('Standard'))
@@ -1762,7 +1853,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
 
             # create label subnode
             lfqMode = doc.createElement('lfqMode')
-            if j[1] == 'label free sample':
+            if j["label"] == 'label free sample':
                 lfqMode.appendChild(doc.createTextNode('1'))
             else:
                 lfqMode.appendChild(doc.createTextNode('0'))
@@ -1791,14 +1882,26 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
             lfqMinRatioCount.appendChild(doc.createTextNode('2'))
             maxLabeledAa = doc.createElement('maxLabeledAa')
             multiplicity = doc.createElement('multiplicity')
-            if 'Lys8' in j[1] or 'Arg10' in j[1] or 'Arg6' in j[1] or 'Lys6' in j[1]:
+            if 'Lys8' in j["label"] or 'Arg10' in j["label"] or 'Arg6' in j["label"] or 'Lys6' in j["label"]:
                 maxLabeledAa.appendChild(doc.createTextNode('3'))
-                multiplicity.appendChild(doc.createTextNode(str(j[4][0])))
+                multiplicity.appendChild(doc.createTextNode(str(j["silac_shape"][0])))
             else:
                 maxLabeledAa.appendChild(doc.createTextNode('0'))
                 multiplicity.appendChild(doc.createTextNode('1'))
+
+
+
             maxNmods = doc.createElement('maxNmods')
-            maxNmods.appendChild(doc.createTextNode('5'))
+            if ("max_mods" in datanalysisparams):
+                print(len(set(datanalysisparams["max_mods"])) > 1)
+                first = datanalysisparams["max_mods"]
+                maxNmods.appendChild(doc.createTextNode(first))
+
+            else:
+                maxNmods.appendChild(doc.createTextNode('5'))
+
+
             maxMissedCleavages = doc.createElement('maxMissedCleavages')
             maxMissedCleavages.appendChild(doc.createTextNode('2'))
             enzymeMode = doc.createElement('enzymeMode')
@@ -1815,8 +1918,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
             variableModifications = doc.createElement('variableModifications')
             fixedM_list = []
             Variable_list = []
-            fixedM_list.extend(j[2][0].split(','))
-            Variable_list.extend(j[2][1].split(','))
+            fixedM_list.extend(j["mods"][0].split(','))
+            Variable_list.extend(j["mods"][1].split(','))
             fixedM_list = list(set(fixedM_list))
             Variable_list = list(set(Variable_list))
             for F in fixedM_list:
@@ -1832,9 +1935,9 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
 
             # create enzymes subnode
             enzymes_node = doc.createElement('enzymes')
-            for index in range(len(j[3])):
+            for index in range(len(j["enzyme"])):
                 string = doc.createElement('string')
-                string.appendChild(doc.createTextNode(j[3][index]))
+                string.appendChild(doc.createTextNode(j["enzyme"][index]))
                 enzymes_node.appendChild(string)
             enzymesFirstSearch = doc.createElement('enzymesFirstSearch')
             enzymesFirstSearch.appendChild(doc.createTextNode(''))
@@ -1851,8 +1954,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
             multiModifications = doc.createElement('multiModifications')
             multiModifications.appendChild(doc.createTextNode(''))
             isobaricLabels = doc.createElement('isobaricLabels')
-            if "TMT" in j[1]:
-                for t in j[1].split(','):
+            if "TMT" in j["label"]:
+                for t in j["label"].split(','):
                     IsobaricLabelInfo = doc.createElement('IsobaricLabelInfo')
                     internalLabel = doc.createElement('internalLabel')
                     internalLabel.appendChild(doc.createTextNode(t))
@@ -1876,8 +1979,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
                     IsobaricLabelInfo.appendChild(correctionFactorP2)
                     IsobaricLabelInfo.appendChild(tmtLike)
                     isobaricLabels.appendChild(IsobaricLabelInfo)
-            elif "iTRAQ" in j[1]:
-                for t in j[1].split(','):
+            elif "iTRAQ" in j["label"]:
+                for t in j["label"].split(','):
                     IsobaricLabelInfo = doc.createElement('IsobaricLabelInfo')
                     internalLabel = doc.createElement('internalLabel')
                     internalLabel.appendChild(doc.createTextNode(t))
@@ -1918,25 +2021,16 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
             doMassFiltering.appendChild(doc.createTextNode('True'))
             firstSearchTol = doc.createElement('firstSearchTol')
             mainSearchTol = doc.createElement('mainSearchTol')
-            if len(j) == 8:
-                firstSearchTol.appendChild(doc.createTextNode(str(j[5])))
-                mainSearchTol.appendChild(doc.createTextNode(str(j[4])))
-                if j[6] == 'ppm':
+            firstSearchTol.appendChild(doc.createTextNode(str(j["fragtol"])))
+            mainSearchTol.appendChild(doc.createTextNode(str(j["pctol"])))
+            if j["pctolunit"] == 'ppm':
                 searchTolInPpm = doc.createElement('searchTolInPpm')
                 searchTolInPpm.appendChild(doc.createTextNode('True'))
-                else:
-                    searchTolInPpm = doc.createElement('searchTolInPpm')
-                    searchTolInPpm.appendChild(doc.createTextNode('False'))
             else:
-                firstSearchTol.appendChild(doc.createTextNode(str(j[6])))
-                mainSearchTol.appendChild(doc.createTextNode(str(j[5])))
-                if j[7] == 'ppm':
-                    searchTolInPpm = doc.createElement('searchTolInPpm')
-                    searchTolInPpm.appendChild(doc.createTextNode('True'))
-                else:
                 searchTolInPpm = doc.createElement('searchTolInPpm')
                 searchTolInPpm.appendChild(doc.createTextNode('False'))
             isotopeMatchTol = doc.createElement('isotopeMatchTol')
+            isotopeMatchTol.appendChild(doc.createTextNode('2'))
             isotopeMatchTolInPpm = doc.createElement('isotopeMatchTolInPpm')
             isotopeMatchTolInPpm.appendChild(doc.createTextNode('True'))
@@ -1967,7 +2061,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns
             filterPif = doc.createElement('filterPif')
             reporterFraction = doc.createElement('reporterFraction')
             reporterBasePeakRatio = doc.createElement('reporterBasePeakRatio')
-            if "TMT" in j[1]:
+            if "TMT" in j["label"]:
                 reporterMassTolerance.appendChild(doc.createTextNode('0.003'))
                 reporterPif.appendChild(doc.createTextNode('0'))
                 filterPif.appendChild(doc.createTextNode('False'))
diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py
new file mode 100644
index 0000000..f3252f6
--- /dev/null
+++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py
@@ -0,0 +1,197 @@
+import pandas as pd
+import re
+import yaml
+import os.path
+from sdrf_pipelines.zooma.zooma import OlsClient
+from sdrf_pipelines.openms.unimod import UnimodDatabase
+from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame
+
+# Accessing ontologies and CVs
+unimod = UnimodDatabase()
+olsclient = OlsClient()
+# print(ols_out)
+
+field_types = {"boolean": bool, "str": str, "integer": int, "float": (float, int)}
+
+
+# Function for consistency checks
+def verify_content(pname, pvalue, ptype):
+    # for each type: check consistency
+    # print(type(pvalue))
+    if ptype in field_types.keys():
+        if not isinstance(pvalue, field_types[ptype]):
+            exit("ERROR: " + pname + " needs to be " + ptype + "!!")
+#    if ptype == "boolean":
+#        if not isinstance(pvalue, bool):
+#            exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!")
+#    elif ptype == "str":
+#        if not isinstance(pvalue, str):
+#            exit("ERROR: " + pname + " needs to be a string!!")
+#    elif ptype == "integer":
+#        if not isinstance(pvalue, int):
+#            exit("ERROR: " + pname + " needs to be a string!!")
+#    elif ptype == "float":
+#        if not isinstance(pvalue, (float, int)):
+#            exit("ERROR: " + pname + " needs to be a numeric value!!")
+    elif ptype == "class":
+        not_matching = [x for x in pvalue.split(",") if x not in p["value"]]
+        if not_matching != []:
+            exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" +
+                 ' '.join(not_matching) + " did not match")
+
+    # Mass tolerances: do they include Da or ppm exclusively?
+    if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance":
+        unit = pvalue.split(" ")[1]
+        if unit != "Da" and unit != "ppm":
+            exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the \
+value!!\nWe found " + unit)
+    # ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES
+    elif pname == "enzyme":
+        ols_out = olsclient.search(pvalue, ontology="MS", exact=True)
+        if ols_out is None:
+            exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see \
+https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 \
+for available terms")
+        pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"]
+    return pvalue
+
+
+def new_or_default(params_in, pname, p):
+    if(pname in list(params_in.keys())):
+        print("Found in parameter file")
+        pvalue = params_in[pname]
+    else:
+        print("Setting to default: " + p["default"])
+        pvalue = p["default"]
+    return(pvalue)
+
+
+# Function to load modifications
+def add_ptms(mods, pname, mod_columns):
+    for m in mods:
+        tmod = m.split(" of ")
+        if len(tmod) < 2:
+            exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. \
+Note that it should be single residues")
+        modname = tmod[0]
+        modpos = tmod[1]
+        found = [x for x in unimod.modifications if modname == x.get_name()]
+        if found == []:
+            exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. Also check whether you \
+used space between the comma separated modifications")
+        modtype = pname.replace("_mods", "")
+        if re.match("[A-Z]", modpos):
+            mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" +\
+                modtype + ";TA=" + modpos
+        elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"]:
+            mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" +\
+                modtype + ";PP=" + modpos
+        else:
+            exit("ERROR: Wrong residue given: " + modpos + ". Should be either one upper case letter or any of \"Protein N-term\", \
+ \"Protein C-term\", \"Any N-term\", \"Any C-term\"")
+    return mod_columns
+
+
+# modifications have the same column name, not working with pandas
+# therefore separated
+mod_columns = pd.DataFrame()
+
+# For summary at the end
+overwritten = set()
+
+with open(r'param2sdrf.yml') as file:
+    param_mapping = yaml.safe_load(file)
+    mapping = param_mapping["parameters"]
+
+
+# READ PARAMETERS FOR RUNNING WORKFLOW
+with open(r'params.yml') as file:
+    tparams_in = yaml.safe_load(file)
+    params_in = tparams_in["params"]
+    rawfiles = tparams_in["rawfiles"]
+    fastafile = tparams_in["fastafile"]
+
+# WE NEED AN SDRF FILE FOR THE EXPERIMENTAL DESIGN, CONTAINING FILE LOCATIONS
+sdrf_content = pd.DataFrame()
+has_sdrf = os.path.isfile("./sdrf.tsv")
+if has_sdrf:
+    sdrf_content = pd.read_csv("sdrf.tsv", sep="\t")
+    mod_columns = sdrf_content.filter(like="comment[modification parameters]")
+    sdrf_content = sdrf_content.drop(columns=mod_columns.columns)
+    sdrf_content["comment[modification parameters]"] = None
+    # delete columns with fixed/variable modification info
+    if "fixed_mods" in params_in.keys():
+        ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=fixed"))]
+        mod_columns.drop(ttt, axis=1, inplace=True)
+        overwritten.add("fixed_mods")
+    if "variable_mods" in params_in.keys():
+        ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable"))]
+        mod_columns.drop(ttt, axis=1, inplace=True)
+        overwritten.add("variable_mods")
+else:
+    # THROW ERROR FOR MISSING SDRF
+    exit("ERROR: No SDRF file given. Add at least a minimal version\nFor more details, \
+see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics")
+
+
+# FIRST STANDARD PARAMETERS
+# FOR GIVEN PARAMETERS
+# CHECK WHETHER COLUMN IN SDRF TO PUT WARNING AND OVERWRITE
+# IF NOT GIVEN, WRITE COLUMN
+for p in mapping:
+    pname = p["name"]
+    ptype = p["type"]
+    print("---- Parameter: " + pname + ": ----")
+
+    pvalue = new_or_default(params_in, pname, p)
+
+    psdrf = "comment[" + p["sdrf"] + "]"
+    if psdrf in sdrf_content.keys():
+        if (len(set(sdrf_content[psdrf])) > 1):
+            exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating \
+the file into parts with the same data analysis parameters")
+
+        pvalue = verify_content(pname, pvalue, ptype)
+
+        # Modifications: look up in Unimod
+        if pname in ["fixed_mods", "variable_mods"] and pname in overwritten:
+            mods = pvalue.split(",")
+            print("WARNING: Overwriting " + pname + " values in sdrf file with " + pvalue)
+            mod_columns = add_ptms(mods, pname, mod_columns)
+
+        # Now finally writing the value
+        elif pname not in ["fixed_mods", "variable_mods"]:
+            print("WARNING: Overwriting " + pname + " values in sdrf file with " + pvalue)
+            overwritten.add(pname)
+            sdrf_content[psdrf] = pvalue
+
+    else:
+        sdrf_content[psdrf] = pvalue
+
+# OVERWRITE RAW FILES IF GIVEN TO DIRECT TO THE CORRECT LOCATION?
+
+# ADD FASTA FILE TO SDRF (COMMENT:FASTA DATABASE FILE)?
+
+# WRITE EXPERIMENTAL DESIGN IF NO SDRF?
+
+# adding modification columns
+colnames = list(sdrf_content.columns) + ["comment[modification parameters]"] * len(mod_columns.columns)
+
+sdrf_content = pd.concat([sdrf_content, mod_columns], axis=1)
+sdrf_content.columns = colnames
+
+sdrf_content.dropna(how='all', axis=1, inplace=True)
+
+print("--- Writing sdrf file into sdrf_local.tsv ---")
+# sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False)
+sdrf_content.to_csv("sdrf_local.tsv", sep="\t")
+
+# Verify with sdrf-parser
+check_sdrf = SdrfDataFrame()
+check_sdrf.parse("sdrf_local.tsv")
+check_sdrf.validate("mass_spectrometry")
+
+print("########## SUMMARY #########")
+print("--- The following parameters have been overwritten in the sdrf file: ---")
+for p in overwritten:
+    print(p)
diff --git a/sdrf_pipelines/sdrf_merge/param2sdrf.yml b/sdrf_pipelines/sdrf_merge/param2sdrf.yml
new file mode 100644
index 0000000..07e6e97
--- /dev/null
+++ b/sdrf_pipelines/sdrf_merge/param2sdrf.yml
@@ -0,0 +1,223 @@
+name: param2sdrf
+
+description: Mapping of parameters to run workflows with sdrf headers, including default values
+
+parameters:
+- name: fixed_mods
+  type: ontology
+  sdrf: modification parameters
+  default: NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4
+  comment: only as unimod names with residue(s) separate via comma
+
+- name: variable_mods
+  type: ontology
+  sdrf: modification parameters
+  default: NT=oxidation;MT=variable;TA=M;AC=UNIMOD:35
+  comment: only as unimod names with residue(s)
+
+- name: precursor_mass_tolerance
+  type: string
+  sdrf: precursor mass tolerance
+  default: 30 ppm
+
+- name: fragment_mass_tolerance
+  type: string
+  sdrf: fragment mass tolerance
+  default: 0.05 Da
+
+- name: enzyme
+  type: ontology
+  sdrf: cleavage agent details
+  default: Trypsin
+
+#- name: type_cleavage
+#  type: class
+#  sdrf: type of cleavage
+#  default:
+
+- name: fions
+  type: class
+  sdrf: forward ions
+  default: b
+  value:
+  - a
+  - a1
+  - b
+  - c
+  - c1
+  - a1
+
+- name: rions
+  type: class
+  sdrf: reverse ions
+  default: y
+  value:
+  - x
+  - y
+  - z
+  - z1
+  - z2
+
+- name: isotope_error_range
+  type: integer
+  sdrf: isotope error range
+  default: 0
+
+- name: add_decoys
+  type: boolean
+  sdrf: add decoys
+  default: true
+
+- name: num_hits
+  type: integer
+  sdrf: num peptide hits
+  default: 1
+
+- name: miscleavages
+  type: integer
+  sdrf: allowed miscleavages
+  default: 1
+
+- name: min_precursor_charge
+  type: integer
+  sdrf: minimum precursor charge
+  default: 2
+
+- name: max_precursor_charge
+  type: integer
+  sdrf: maximum precursor charge
+  default: 3
+
+- name: min_peptide_length
+  type: integer
+  sdrf: minimum peptide length
+  default: 8
+
+- name: max_peptide_length
+  type: integer
+  sdrf: maximum peptide length
+  default: 12
+
+- name: max_mods
+  type: integer
+  sdrf: maximum allowed modification
+  default: 4
+
+- name: fdr_psm
+  type: float
+  sdrf: fdr on psm level
+  default: 0.01
+
+- name: fdr_peptide
+  type: float
+  sdrf: fdr on peptide level
+  default: 0.01
+
+- name: fdr_protein
+  type: float
+  sdrf: fdr on protein level
+  default: 0.01
+
+#- name: match_between_runs_rt_tol
+#- type: float
+#- sdrf: retention time tolerance match between runs
+#- default: TODO
+
+#- name: match_between_runs_mz_tol
+#- type: float
+#- sdrf: m over z tolerance match between runs
+#- default: TODO
+
+- name: match_between_runs
+  type: boolean
+  sdrf: run match between runs
+  default: true
+
+- name: protein_inference
+  type: class
+  sdrf: protein inference method
+  default: unique
+  value:
+  - unique
+  - shared
+  - parsimony
+  - other
+
+- name: quantification_method
+  type: class
+  sdrf: quantification method
+  default: intensity
+  value:
+  - precursor
+  - ms2
+  - labelled
+  - other
+
+- name: summarization_proteins
+  type: class
+  sdrf: summarization of proteins method
+  default: sum_abs
+  value:
+  - sum_abs
+  - mean
+  - median
+  - other
+  comment: sum_abs is the only one on non-transformed data
+
+- name: min_num_peptides
+  type: integer
+  sdrf: minimum number of peptides per protein
+  default: 2
+
+- name: summarization_psms
+  type: class
+  sdrf: summarization of psms method
+  default: sum_abs
+  value:
+  - sum_abs
+  - mean
+  - median
+  - other
+
+
+- name: quant_transformation
+  type: class
+  sdrf: transformation of quantitative values
+  default: log
+  value:
+  - log
+  - rlog
+  - none
+  - other
+
+- name: normalization_method
+  type: class
+  sdrf: normalization method
+  default: median
+  value:
+  - mean
+  - median
+  - quantile
+  - linear
+  - non-linear
+  - other
+
+- name: run_statistics
+  type: boolean
+  sdrf: run statistical tests
+  default: true
+
+- name: fdr_method
+  type: class
+  sdrf: method for correction of multiple testing
+  default: benjamini-hochberg
+  value:
+  - benjamini-hochberg
+  - bonferroni
+  - qvalue
+  - other
+
+- name: fdr_threshold
+  type: float
+  sdrf: threshold for statistical test fdr
+  default: 0.01
diff --git a/setup.py b/setup.py
index 8f98e65..d00269f 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
     long_description_content_type="text/markdown",
     long_description=long_description,
     license="'Apache 2.0",
-    data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml"])],
+    data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/sdrf_merge/param2sdrf.yml"])],
     package_data={'': ['*.xml'], },
     url="https://github.com/bigbio/sdrf-pipelines",
     packages=find_packages(),
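
For reviewers who want to try the new sdrf_merge script, here is a minimal params.yml sketch that add_datanalysis_param.py above would read. The top-level keys (params, rawfiles, fastafile) follow the script's tparams_in lookups and the parameter names follow param2sdrf.yml; the concrete values, file paths, and the list layout of rawfiles are illustrative assumptions, not part of this patch.

# params.yml -- illustrative sketch only, not included in this patch
params:
  enzyme: Trypsin                     # looked up in the MS ontology by verify_content()
  fixed_mods: Carbamidomethyl of C    # "NAME of RESIDUE" form expected by add_ptms()
  variable_mods: Oxidation of M       # multiple mods are comma-separated, no space after the comma
  precursor_mass_tolerance: 20 ppm    # unit must be "ppm" or "Da", separated by a space
  fdr_protein: 0.01
rawfiles:                             # layout assumed; the script currently only reads the key
  - data/run01.raw
  - data/run02.raw
fastafile: db/uniprot_human.fasta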