From 2f99f734dff7bc327522252ed6ed06335b24da54 Mon Sep 17 00:00:00 2001 From: veitveit Date: Thu, 27 Jan 2022 18:27:59 +0100 Subject: [PATCH 1/8] added various data analysis parameters --- sdrf_pipelines/maxquant/maxquant.py | 218 +++++++++++++++------ sdrf_pipelines/sdrf_merge/param2sdrf.yml | 223 ++++++++++++++++++++++ sdrf_pipelines/sdrf_merge/params2sdrf.yml | 223 ++++++++++++++++++++++ sdrf_pipelines/sdrf_merge/sdrf-mapper.py | 174 +++++++++++++++++ setup.py | 2 +- 5 files changed, 778 insertions(+), 62 deletions(-) create mode 100644 sdrf_pipelines/sdrf_merge/param2sdrf.yml create mode 100644 sdrf_pipelines/sdrf_merge/params2sdrf.yml create mode 100644 sdrf_pipelines/sdrf_merge/sdrf-mapper.py diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py index df7f0cb..5852ae4 100644 --- a/sdrf_pipelines/maxquant/maxquant.py +++ b/sdrf_pipelines/maxquant/maxquant.py @@ -11,6 +11,7 @@ from xml.dom.minidom import parse from xml.dom.minidom import Document from datetime import datetime +import yaml import os import numpy as np import pkg_resources @@ -22,6 +23,8 @@ def __init__(self) -> None: super().__init__() self.warnings = dict() self.modfile = pkg_resources.resource_filename(__name__, "modifications.xml") + self.datparamfile = pkg_resources.resource_filename(__name__, "param2sdrf.yml") + print(self.datparamfile) def create_new_mods(self, mods, mqconfdir): i = 0 @@ -645,6 +648,14 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns sdrf = sdrf.astype(str) sdrf.columns = map(str.lower, sdrf.columns) # convert column names to lower-case + with open(self.datparamfile) as file: + param_mapping = yaml.load(file, Loader=yaml.FullLoader) + mapping = param_mapping["parameters"] + datparams = dict() + for i in mapping: + datparams[i["sdrf"]] = i["name"] + + # map filename to tuple of [fixed, variable] mods mod_cols = [c for ind, c in enumerate(sdrf) if c.startswith('comment[modification parameters')] # columns with modification parameters @@ -670,6 +681,11 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns source_name2n_reps = dict() file2technical_rep = dict() file2instrument = dict() + # New parameters from extended SDRF (including data analysis parameters) + file2params = dict() + for p in datparams.values(): + file2params[p] = dict() + if mqconfdir: self.create_new_mods(sdrf[mod_cols], mqconfdir) @@ -760,6 +776,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns file2fragtol[raw] = "20" file2fragtolunit[raw] = "ppm" + # STILL NOT IMPLEMENTED! 
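+ # Illustrative sketch (not part of the patch): how the new param2sdrf.yml mapping is meant to be consumed. datparams inverts the YAML entries to map SDRF comment-column names to workflow parameter names, and file2params then collects one value per raw file. The two mapping entries and the SDRF row below are made-up examples.
+ # param_mapping = {"parameters": [{"name": "min_peptide_length", "sdrf": "minimum peptide length"}, {"name": "fdr_protein", "sdrf": "fdr on protein level"}]}
+ # datparams = {entry["sdrf"]: entry["name"] for entry in param_mapping["parameters"]}
+ # file2params = {name: dict() for name in datparams.values()}
+ # raw = "sample_01.raw"
+ # row = {"comment[minimum peptide length]": "7", "comment[fdr on protein level]": "0.01"}
+ # for sdrf_name, param_name in datparams.items():
+ #     comment_p = "comment[" + sdrf_name + "]"
+ #     if comment_p in row:
+ #         file2params[param_name][raw] = row[comment_p]
+ # file2params == {"min_peptide_length": {"sample_01.raw": "7"}, "fdr_protein": {"sample_01.raw": "0.01"}}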
if 'comment[dissociation method]' in row: if row['comment[dissociation method]'] == 'not available': file2diss[raw] = 'HCD' @@ -780,6 +797,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns else: file2technical_rep[raw] = "1" + # store highest replicate number for this source name if source_name in source_name2n_reps: source_name2n_reps[source_name] = max(int(source_name2n_reps[source_name]), @@ -999,6 +1017,15 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns file2label[raw] = lt + # Reading data analysis parameters + for p in datparams.keys(): + comment_p = 'comment[' + p + ']' + if comment_p in row: + file2params[datparams[p]][raw] = row[comment_p] + + + + # create maxquant parameters xml file doc = Document() @@ -1144,7 +1171,14 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(secondPeptide) matchBetweenRuns_node = doc.createElement('matchBetweenRuns') - matchBetweenRuns_node.appendChild(doc.createTextNode(matchBetweenRuns)) + if (len(file2params["match_between_runs"]) > 0): + first = list(file2params["match_between_runs"].values())[0] + matchBetweenRuns_node.appendChild(doc.createTextNode(first)) + if(len(set(file2params["match_between_runs"].values())) > 1): + warning_message = "multiple values for match between runs, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + matchBetweenRuns_node.appendChild(doc.createTextNode(matchBetweenRuns)) root.appendChild(matchBetweenRuns_node) matchUnidentifiedFeatures = doc.createElement('matchUnidentifiedFeatures') @@ -1259,23 +1293,62 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(intensityPredictionsFile) minPepLen = doc.createElement('minPepLen') - minPepLen.appendChild(doc.createTextNode('7')) + tparam = file2params["min_peptide_length"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + minPepLen.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter minimum peptide length, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + minPepLen.appendChild(doc.createTextNode('7')) root.appendChild(minPepLen) + psmFdrCrosslink = doc.createElement('psmFdrCrosslink') psmFdrCrosslink.appendChild(doc.createTextNode('0.01')) root.appendChild(psmFdrCrosslink) peptideFdr = doc.createElement('peptideFdr') - peptideFdr.appendChild(doc.createTextNode(str(peptideFDR))) + tparam = file2params["fdr_peptide"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + warning_message = "overwriting peptide FDR using the value in the sdrf file" + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + peptideFdr.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter Peptide FDR, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + peptideFdr.appendChild(doc.createTextNode(str(peptideFDR))) root.appendChild(peptideFdr) proteinFdr = doc.createElement('proteinFdr') - proteinFdr.appendChild(doc.createTextNode(str(proteinFDR))) + tparam = file2params["fdr_protein"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + warning_message = "overwriting protein FDR using the value in the sdrf file" + self.warnings[warning_message] = 
self.warnings.get(warning_message, 0) + 1 + proteinFdr.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter Protein FDR, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + proteinFdr.appendChild(doc.createTextNode(str(proteinFDR))) root.appendChild(proteinFdr) siteFdr = doc.createElement('siteFdr') - siteFdr.appendChild(doc.createTextNode('0.01')) + tparam = file2params["fdr_psm"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + warning_message = "overwriting PSM FDR using the value in the sdrf file" + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + siteFdr.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter PSM FDR, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + siteFdr.appendChild(doc.createTextNode('0.01')) root.appendChild(siteFdr) minPeptideLengthForUnspecificSearch = doc.createElement('minPeptideLengthForUnspecificSearch') @@ -1291,7 +1364,15 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(useNormRatiosForOccupancy) minPeptides = doc.createElement('minPeptides') - minPeptides.appendChild(doc.createTextNode('1')) + tparam = file2params["min_num_peptides"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + minPeptides.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter minimum number of peptides, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + minPeptides.appendChild(doc.createTextNode('1')) root.appendChild(minPeptides) minRazorPeptides = doc.createElement('minRazorPeptides') @@ -1362,8 +1443,23 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns compositionPrediction.appendChild(doc.createTextNode('0')) root.appendChild(compositionPrediction) + quantMode = doc.createElement('quantMode') - quantMode.appendChild(doc.createTextNode('1')) + tparam = file2params["quantification_method"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + if first == "unique": + first = '2' + elif first == "shared": + first = '0' + else: + first = '1' + quantMode.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter Quantification mode, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + quantMode.appendChild(doc.createTextNode('1')) root.appendChild(quantMode) massDifferenceMods = doc.createElement('massDifferenceMods') @@ -1536,23 +1632,23 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns referenceChannel = doc.createElement('referenceChannel') for key1, instr_val in file2instrument.items(): - value2 = str(file2enzyme[key1]) + file2label[key1] + str(file2mods[key1]) + str(file2pctol) \ - + str(file2fragtol) - + value2 = str(file2enzyme[key1]) + file2label[key1] + str(file2mods[key1]) + str(file2pctol) + str(file2fragtol) + datanalysisparams = dict() + for p in file2params.keys(): + if (len(file2params[p]) > 0): + datanalysisparams[p] = file2params[p][key1] + + if tag == 0 and tmp == []: int_node = doc.createElement('int') int_node.appendChild(doc.createTextNode('0')) 
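+ # Illustrative sketch (not part of the patch): the minPepLen, peptideFdr, proteinFdr, siteFdr, minPeptides and quantMode blocks above all repeat the same "take the first SDRF-supplied value, warn if the files disagree, otherwise keep the default" logic. A hypothetical helper along these lines would capture that pattern:
+ # def first_value_or_default(per_file_values, default, label, warnings):
+ #     # per_file_values is one of the file2params[...] dicts (raw file -> value)
+ #     if len(per_file_values) > 0:
+ #         first = list(per_file_values.values())[0]
+ #         if len(set(per_file_values.values())) > 1:
+ #             msg = "multiple values for parameter " + label + ", taking the first: " + first
+ #             warnings[msg] = warnings.get(msg, 0) + 1
+ #         return first
+ #     return default
+ # e.g. minPepLen.appendChild(doc.createTextNode(first_value_or_default(file2params["min_peptide_length"], '7', "minimum peptide length", self.warnings)))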
tmp.append({instr_val: value2}) + parameterGroup['0'] = {'instrument': file2instrument[key1], 'label': file2label[key1] , 'mods': file2mods[key1], 'enzyme': file2enzyme[key1], 'pctol': file2pctol[key1], + 'fragtol': file2fragtol[key1],'pctolunit': file2pctolunit[key1], 'fragtolunit': file2fragtolunit[key1], 'datanalysisparams': datanalysisparams } if 'Lys8' in file2label[key1] or 'Arg10' in file2label[key1] or 'Arg6' in \ - file2label[key1] or 'Lys6' in file2label[key1]: - parameterGroup['0'] = [file2instrument[key1], file2label[key1], file2mods[key1], file2enzyme[key1], - file2silac_shape[key1], file2pctol[key1], file2fragtol[key1], - file2pctolunit[key1], file2fragtolunit[key1]] - else: - parameterGroup['0'] = [file2instrument[key1], file2label[key1], file2mods[key1], file2enzyme[key1], - file2pctol[key1], file2fragtol[key1], file2pctolunit[key1], - [file2fragtolunit[key1]]] - + file2label[key1] or 'Lys6' in file2label[key1]: + parameterGroup['0']['silac_shape'] = file2silac_shape[key1] + elif {instr_val: value2} in tmp: int_node = doc.createElement('int') int_text = doc.createTextNode(str(tag)) @@ -1564,16 +1660,13 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns int_text = doc.createTextNode(str(tag)) int_node.appendChild(int_text) tmp.append({instr_val: value2}) + parameterGroup[str(tag)] = {'instrument': file2instrument[key1], 'label': file2label[key1] , 'mods': file2mods[key1], 'enzyme': file2enzyme[key1], + 'pctol': file2pctol[key1], 'fragtol': file2fragtol[key1],'pctolunit': file2pctolunit[key1], 'fragtolunit': file2fragtolunit[key1], 'datparams': datparams} + if 'Lys8' in file2label[key1] or 'Arg10' in file2label[key1] or 'Arg6' in file2label[key1] \ or 'Lys6' in file2label[key1]: - parameterGroup[str(tag)] = [file2instrument[key1], file2label[key1], file2mods[key1], - file2enzyme[key1], file2silac_shape[key1], - file2pctol[key1], file2fragtol[key1], file2pctolunit[key1], - file2fragtolunit[key1]] - else: - parameterGroup[str(tag)] = [file2instrument[key1], file2label[key1], file2mods[key1], - file2enzyme[key1], file2pctol[key1], file2fragtol[key1], - file2pctolunit[key1], file2fragtolunit[key1]] + parameterGroup[str(tag)]['silac_shape'] = file2silac_shape[key1] + paramGroupIndices.appendChild(int_node) @@ -1600,7 +1693,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns for i, j in parameterGroup.items(): parameterGroup = doc.createElement('parameterGroup') msInstrument = doc.createElement('msInstrument') - if 'Bruker Q-TOF' == j[0]: + if 'Bruker Q-TOF' == j['instrument']: msInstrument.appendChild(doc.createTextNode('1')) maxCharge = doc.createElement('maxCharge') maxCharge.appendChild(doc.createTextNode('5')) @@ -1624,7 +1717,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns advancedPeakSplitting.appendChild(doc.createTextNode('True')) intensityThreshold = doc.createElement('intensityThreshold') intensityThreshold.appendChild(doc.createTextNode('30')) - elif 'AB Sciex Q-TOF' == j[0]: + elif 'AB Sciex Q-TOF' == j["instrument"]: msInstrument.appendChild(doc.createTextNode('2')) maxCharge = doc.createElement('maxCharge') maxCharge.appendChild(doc.createTextNode('5')) @@ -1648,7 +1741,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns advancedPeakSplitting.appendChild(doc.createTextNode('True')) intensityThreshold = doc.createElement('intensityThreshold') intensityThreshold.appendChild(doc.createTextNode('0')) - elif 'Agilent Q-TOF' == j[0]: + 
elif 'Agilent Q-TOF' == j["instrument"]: msInstrument.appendChild(doc.createTextNode('3')) maxCharge = doc.createElement('maxCharge') maxCharge.appendChild(doc.createTextNode('5')) @@ -1672,7 +1765,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns advancedPeakSplitting.appendChild(doc.createTextNode('True')) intensityThreshold = doc.createElement('intensityThreshold') intensityThreshold.appendChild(doc.createTextNode('0')) - elif 'Bruker TIMS' == j[0]: + elif 'Bruker TIMS' == j["instrument"]: msInstrument.appendChild(doc.createTextNode('4')) maxCharge = doc.createElement('maxCharge') maxCharge.appendChild(doc.createTextNode('4')) @@ -1738,9 +1831,9 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns isotopeValleyFactor = doc.createElement('isotopeValleyFactor') isotopeValleyFactor.appendChild(doc.createTextNode('1.2')) labelMods = doc.createElement('labelMods') - if 'Lys8' in j[1] or 'Arg10' in j[1] or 'Arg6' in j[1] or 'Lys6' in j[1]: - for lm in range(j[4][0]): - r = j[1].split(',')[lm * j[4][1]:lm * j[4][1] + lm * j[4][1]] + if 'Lys8' in j["label"] or 'Arg10' in j["label"] or 'Arg6' in j["label"] or 'Lys6' in j["label"]: + for lm in range(j["silac_shape"][0]): + r = j["label"].split(',')[lm * j["silac_shape"][1]:lm * j["silac_shape"][1] + lm * j["silac_shape"][1]] if 'Arg0' in r: r.remove('Arg0') text = ';'.join(r) @@ -1753,7 +1846,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns string.appendChild(doc.createTextNode('')) labelMods.appendChild(string) lcmsRunType = doc.createElement('lcmsRunType') - if "TMT" in j[1] or 'iTRAQ' in j[1]: + if "TMT" in j["label"] or 'iTRAQ' in j["label"]: lcmsRunType.appendChild(doc.createTextNode('Reporter ion MS2')) else: lcmsRunType.appendChild(doc.createTextNode('Standard')) @@ -1762,7 +1855,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns # create label subnode lfqMode = doc.createElement('lfqMode') - if j[1] == 'label free sample': + if j["label"] == 'label free sample': lfqMode.appendChild(doc.createTextNode('1')) else: lfqMode.appendChild(doc.createTextNode('0')) @@ -1791,14 +1884,26 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns lfqMinRatioCount.appendChild(doc.createTextNode('2')) maxLabeledAa = doc.createElement('maxLabeledAa') multiplicity = doc.createElement('multiplicity') - if 'Lys8' in j[1] or 'Arg10' in j[1] or 'Arg6' in j[1] or 'Lys6' in j[1]: + if 'Lys8' in j["label"] or 'Arg10' in j["label"] or 'Arg6' in j["label"] or 'Lys6' in j["label"]: maxLabeledAa.appendChild(doc.createTextNode('3')) - multiplicity.appendChild(doc.createTextNode(str(j[4][0]))) + multiplicity.appendChild(doc.createTextNode(str(j["silac_shape"][0]))) else: maxLabeledAa.appendChild(doc.createTextNode('0')) multiplicity.appendChild(doc.createTextNode('1')) + + + maxNmods = doc.createElement('maxNmods') - maxNmods.appendChild(doc.createTextNode('5')) + if ("max_mods" in datanalysisparams): + print(len(set(datanalysisparams["max_mods"])) > 1) + first = datanalysisparams["max_mods"] + maxNmods.appendChild(doc.createTextNode(first)) + + else: + maxNmods.appendChild(doc.createTextNode('5')) + + + maxMissedCleavages = doc.createElement('maxMissedCleavages') maxMissedCleavages.appendChild(doc.createTextNode('2')) enzymeMode = doc.createElement('enzymeMode') @@ -1815,8 +1920,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns variableModifications = 
doc.createElement('variableModifications') fixedM_list = [] Variable_list = [] - fixedM_list.extend(j[2][0].split(',')) - Variable_list.extend(j[2][1].split(',')) + fixedM_list.extend(j["mods"][0].split(',')) + Variable_list.extend(j["mods"][1].split(',')) fixedM_list = list(set(fixedM_list)) Variable_list = list(set(Variable_list)) for F in fixedM_list: @@ -1832,9 +1937,9 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns # create enzymes subnode enzymes_node = doc.createElement('enzymes') - for index in range(len(j[3])): + for index in range(len(j["enzyme"])): string = doc.createElement('string') - string.appendChild(doc.createTextNode(j[3][index])) + string.appendChild(doc.createTextNode(j["enzyme"][index])) enzymes_node.appendChild(string) enzymesFirstSearch = doc.createElement('enzymesFirstSearch') enzymesFirstSearch.appendChild(doc.createTextNode('')) @@ -1851,8 +1956,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns multiModifications = doc.createElement('multiModifications') multiModifications.appendChild(doc.createTextNode('')) isobaricLabels = doc.createElement('isobaricLabels') - if "TMT" in j[1]: - for t in j[1].split(','): + if "TMT" in j["label"]: + for t in j["label"].split(','): IsobaricLabelInfo = doc.createElement('IsobaricLabelInfo') internalLabel = doc.createElement('internalLabel') internalLabel.appendChild(doc.createTextNode(t)) @@ -1876,8 +1981,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns IsobaricLabelInfo.appendChild(correctionFactorP2) IsobaricLabelInfo.appendChild(tmtLike) isobaricLabels.appendChild(IsobaricLabelInfo) - elif "iTRAQ" in j[1]: - for t in j[1].split(','): + elif "iTRAQ" in j["label"]: + for t in j["label"].split(','): IsobaricLabelInfo = doc.createElement('IsobaricLabelInfo') internalLabel = doc.createElement('internalLabel') internalLabel.appendChild(doc.createTextNode(t)) @@ -1918,25 +2023,16 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns doMassFiltering.appendChild(doc.createTextNode('True')) firstSearchTol = doc.createElement('firstSearchTol') mainSearchTol = doc.createElement('mainSearchTol') - if len(j) == 8: - firstSearchTol.appendChild(doc.createTextNode(str(j[5]))) - mainSearchTol.appendChild(doc.createTextNode(str(j[4]))) - if j[6] == 'ppm': + firstSearchTol.appendChild(doc.createTextNode(str(j["fragtol"]))) + mainSearchTol.appendChild(doc.createTextNode(str(j["pctol"]))) + if j["pctolunit"] == 'ppm': searchTolInPpm = doc.createElement('searchTolInPpm') searchTolInPpm.appendChild(doc.createTextNode('True')) - else: - searchTolInPpm = doc.createElement('searchTolInPpm') - searchTolInPpm.appendChild(doc.createTextNode('False')) else: - firstSearchTol.appendChild(doc.createTextNode(str(j[6]))) - mainSearchTol.appendChild(doc.createTextNode(str(j[5]))) - if j[7] == 'ppm': - searchTolInPpm = doc.createElement('searchTolInPpm') - searchTolInPpm.appendChild(doc.createTextNode('True')) - else: searchTolInPpm = doc.createElement('searchTolInPpm') searchTolInPpm.appendChild(doc.createTextNode('False')) isotopeMatchTol = doc.createElement('isotopeMatchTol') + isotopeMatchTol.appendChild(doc.createTextNode('2')) isotopeMatchTolInPpm = doc.createElement('isotopeMatchTolInPpm') isotopeMatchTolInPpm.appendChild(doc.createTextNode('True')) @@ -1967,7 +2063,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns filterPif = doc.createElement('filterPif') reporterFraction = 
doc.createElement('reporterFraction') reporterBasePeakRatio = doc.createElement('reporterBasePeakRatio') - if "TMT" in j[1]: + if "TMT" in j["label"]: reporterMassTolerance.appendChild(doc.createTextNode('0.003')) reporterPif.appendChild(doc.createTextNode('0')) filterPif.appendChild(doc.createTextNode('False')) diff --git a/sdrf_pipelines/sdrf_merge/param2sdrf.yml b/sdrf_pipelines/sdrf_merge/param2sdrf.yml new file mode 100644 index 0000000..07e6e97 --- /dev/null +++ b/sdrf_pipelines/sdrf_merge/param2sdrf.yml @@ -0,0 +1,223 @@ +name: param2sdrf + +description: Mapping of parameters to run workflows with sdrf headers, including default values + +parameters: +- name: fixed_mods + type: ontology + sdrf: modification parameters + default: NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 + comment: only as unimod names with residue(s) separate via comma + +- name: variable_mods + type: ontology + sdrf: modification parameters + default: NT=oxidation;MT=variable;TA=M;AC=UNIMOD:35 + comment: only as unimod names with residue(s) + +- name: precursor_mass_tolerance + type: string + sdrf: precursor mass tolerance + default: 30 ppm + +- name: fragment_mass_tolerance + type: string + sdrf: fragment mass tolerance + default: 0.05 Da + +- name: enzyme + type: ontology + sdrf: cleavage agent details + default: Trypsin + +#- name: type_cleavage +# type: class +# sdrf: type of cleavage +# default: + +- name: fions + type: class + sdrf: forward ions + default: b + value: + - a + - a1 + - b + - c + - c1 + - a1 + +- name: rions + type: class + sdrf: reverse ions + default: y + value: + - x + - y + - z + - z1 + - z2 + +- name: isotope_error_range + type: integer + sdrf: isotope error range + default: 0 + +- name: add_decoys + type: boolean + sdrf: add decoys + default: true + +- name: num_hits + type: integer + sdrf: num peptide hits + default: 1 + +- name: miscleavages + type: integer + sdrf: allowed miscleavages + default: 1 + +- name: min_precursor_charge + type: integer + sdrf: minimum precursor charge + default: 2 + +- name: max_precursor_charge + type: integer + sdrf: maximum precursor charge + default: 3 + +- name: min_peptide_length + type: integer + sdrf: minimum peptide length + default: 8 + +- name: max_peptide_length + type: integer + sdrf: maximum peptide length + default: 12 + +- name: max_mods + type: integer + sdrf: maximum allowed modification + default: 4 + +- name: fdr_psm + type: float + sdrf: fdr on psm level + default: 0.01 + +- name: fdr_peptide + type: float + sdrf: fdr on peptide level + default: 0.01 + +- name: fdr_protein + type: float + sdrf: fdr on protein level + default: 0.01 + +#- name: match_between_runs_rt_tol +#- type: float +#- sdrf: retention time tolerance match between runs +#- default: TODO + +#- name: match_between_runs_mz_tol +#- type: float +#- sdrf: m over z tolerance match between runs +#- default: TODO + +- name: match_between_runs + type: boolean + sdrf: run match between runs + default: true + +- name: protein_inference + type: class + sdrf: protein inference method + default: unique + value: + - unique + - shared + - parsimony + - other + +- name: quantification_method + type: class + sdrf: quantification method + default: intensity + value: + - precursor + - ms2 + - labelled + - other + +- name: summarization_proteins + type: class + sdrf: summarization of proteins method + default: sum_abs + value: + - sum_abs + - mean + - median + - other + comment: sum_abs is the only one on non-transformed data + +- name: min_num_peptides + type: integer + sdrf: minimum number 
of peptides per protein + default: 2 + +- name: summarization_psms + type: class + sdrf: summarization of psms method + default: sum_abs + value: + - sum_abs + - mean + - median + - other + + +- name: quant_transformation + type: class + sdrf: transformation of quantitative values + default: log + value: + - log + - rlog + - none + - other + +- name: normalization_method + type: class + sdrf: normalization method + default: median + value: + - mean + - median + - quantile + - linear + - non-linear + - other + +- name: run_statistics + type: boolean + sdrf: run statistical tests + default: true + +- name: fdr_method + type: class + sdrf: method for correction of multiple testing + default: benjamini-hochberg + value: + - benjamini-hochberg + - bonferroni + - qvalue + - other + +- name: fdr_threshold + type: float + sdrf: threshold for statistical test fdr + default: 0.01 diff --git a/sdrf_pipelines/sdrf_merge/params2sdrf.yml b/sdrf_pipelines/sdrf_merge/params2sdrf.yml new file mode 100644 index 0000000..07e6e97 --- /dev/null +++ b/sdrf_pipelines/sdrf_merge/params2sdrf.yml @@ -0,0 +1,223 @@ +name: param2sdrf + +description: Mapping of parameters to run workflows with sdrf headers, including default values + +parameters: +- name: fixed_mods + type: ontology + sdrf: modification parameters + default: NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 + comment: only as unimod names with residue(s) separate via comma + +- name: variable_mods + type: ontology + sdrf: modification parameters + default: NT=oxidation;MT=variable;TA=M;AC=UNIMOD:35 + comment: only as unimod names with residue(s) + +- name: precursor_mass_tolerance + type: string + sdrf: precursor mass tolerance + default: 30 ppm + +- name: fragment_mass_tolerance + type: string + sdrf: fragment mass tolerance + default: 0.05 Da + +- name: enzyme + type: ontology + sdrf: cleavage agent details + default: Trypsin + +#- name: type_cleavage +# type: class +# sdrf: type of cleavage +# default: + +- name: fions + type: class + sdrf: forward ions + default: b + value: + - a + - a1 + - b + - c + - c1 + - a1 + +- name: rions + type: class + sdrf: reverse ions + default: y + value: + - x + - y + - z + - z1 + - z2 + +- name: isotope_error_range + type: integer + sdrf: isotope error range + default: 0 + +- name: add_decoys + type: boolean + sdrf: add decoys + default: true + +- name: num_hits + type: integer + sdrf: num peptide hits + default: 1 + +- name: miscleavages + type: integer + sdrf: allowed miscleavages + default: 1 + +- name: min_precursor_charge + type: integer + sdrf: minimum precursor charge + default: 2 + +- name: max_precursor_charge + type: integer + sdrf: maximum precursor charge + default: 3 + +- name: min_peptide_length + type: integer + sdrf: minimum peptide length + default: 8 + +- name: max_peptide_length + type: integer + sdrf: maximum peptide length + default: 12 + +- name: max_mods + type: integer + sdrf: maximum allowed modification + default: 4 + +- name: fdr_psm + type: float + sdrf: fdr on psm level + default: 0.01 + +- name: fdr_peptide + type: float + sdrf: fdr on peptide level + default: 0.01 + +- name: fdr_protein + type: float + sdrf: fdr on protein level + default: 0.01 + +#- name: match_between_runs_rt_tol +#- type: float +#- sdrf: retention time tolerance match between runs +#- default: TODO + +#- name: match_between_runs_mz_tol +#- type: float +#- sdrf: m over z tolerance match between runs +#- default: TODO + +- name: match_between_runs + type: boolean + sdrf: run match between runs + default: true + +- name: 
protein_inference + type: class + sdrf: protein inference method + default: unique + value: + - unique + - shared + - parsimony + - other + +- name: quantification_method + type: class + sdrf: quantification method + default: intensity + value: + - precursor + - ms2 + - labelled + - other + +- name: summarization_proteins + type: class + sdrf: summarization of proteins method + default: sum_abs + value: + - sum_abs + - mean + - median + - other + comment: sum_abs is the only one on non-transformed data + +- name: min_num_peptides + type: integer + sdrf: minimum number of peptides per protein + default: 2 + +- name: summarization_psms + type: class + sdrf: summarization of psms method + default: sum_abs + value: + - sum_abs + - mean + - median + - other + + +- name: quant_transformation + type: class + sdrf: transformation of quantitative values + default: log + value: + - log + - rlog + - none + - other + +- name: normalization_method + type: class + sdrf: normalization method + default: median + value: + - mean + - median + - quantile + - linear + - non-linear + - other + +- name: run_statistics + type: boolean + sdrf: run statistical tests + default: true + +- name: fdr_method + type: class + sdrf: method for correction of multiple testing + default: benjamini-hochberg + value: + - benjamini-hochberg + - bonferroni + - qvalue + - other + +- name: fdr_threshold + type: float + sdrf: threshold for statistical test fdr + default: 0.01 diff --git a/sdrf_pipelines/sdrf_merge/sdrf-mapper.py b/sdrf_pipelines/sdrf_merge/sdrf-mapper.py new file mode 100644 index 0000000..6a649a2 --- /dev/null +++ b/sdrf_pipelines/sdrf_merge/sdrf-mapper.py @@ -0,0 +1,174 @@ +import pandas as pd +import re +import yaml +import sdrf_pipelines +import os.path +from sdrf_pipelines.zooma.zooma import Zooma, OlsClient +from sdrf_pipelines.openms.unimod import UnimodDatabase +from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame + +## Accessing ontologies and CVs +unimod = UnimodDatabase() +olsclient = OlsClient() +#print(ols_out) + +# modifications have the same column name, not working with pandas +# therefore separated +mod_columns = pd.DataFrame() + +## For summary at the end +overwritten = set() + +with open(r'param2sdrf.yml') as file: + param_mapping = yaml.load(file, Loader=yaml.FullLoader) + mapping = param_mapping["parameters"] + + +## READ PARAMETERS FOR RUNNING WORKFLOW +with open(r'params.yml') as file: + tparams_in = yaml.load(file, Loader=yaml.FullLoader) + params_in = tparams_in["params"] + rawfiles = tparams_in["rawfiles"] + fastafile = tparams_in["fastafile"] + + + +## WE NEED AN SDRF FILE FOR THE EXPERIMENTAL DESIGN, CONTAINING FILE LOCATIONS +sdrf_content = pd.DataFrame() +has_sdrf = os.path.isfile("./sdrf.tsv") +if has_sdrf : + sdrf_content = pd.read_csv("sdrf.tsv", sep="\t") + mod_columns = sdrf_content.filter(like="comment[modification parameters]") + sdrf_content = sdrf_content.drop(columns=mod_columns.columns) + # delete columns with fixed/variable modification info + if "fixed_mods" in params_in.keys(): + ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=fixed")) ] + mod_columns.drop(ttt, axis=1, inplace=True) + overwritten.add("fixed_mods") + if "variable_mods" in params_in.keys(): + ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable")) ] + mod_columns.drop(ttt, axis=1, inplace=True) + overwritten.add("variable_mods") + + +else: + ## THROW ERROR FOR MISSING SDRF + exit("ERROR: No SDRF file given. 
Add an at least minimal version\nFor more details, see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") + + +## FIRST STANDARD PARAMETERS +# FOR GIVEN PARAMETERS +# CHECK WHETHER COLUMN IN SDRF TO PUT WARNING AND OVERWRITE +# IF NOT GIVEN, WRITE COLUMN +for p in mapping: + pname = p["name"] + ptype = p["type"] + print("---- Parameter: " + pname + ": ----") + if(pname in list(params_in.keys())) : + print("Found in parameter file") + pvalue = params_in[pname] + else: + print("Setting to default: " + p["default"]) + pvalue = p["default"] + + psdrf = "comment[" + p["sdrf"] + "]" + + + + if (psdrf) in sdrf_content.keys() : + if (len(set(sdrf_content[psdrf])) > 1) : + exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating the file into parts with the same data analysis parameters") + print("WARNING: Overwriting values in sdrf file with " + pvalue) + overwritten.add(pname) + + # for each type: check consistency + #print(type(pvalue)) + if ptype == "boolean" : + if not isinstance(pvalue, bool) : + exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") + if ptype == "str" : + if not isinstance(pvalue, str) : + exit("ERROR: " + pname + " needs to be a string!!") + if ptype == "integer" : + if not isinstance(pvalue, int) : + exit("ERROR: " + pname + " needs to be a string!!") + if ptype == "float" : + if not isinstance(pvalue, (float, int)) : + exit("ERROR: " + pname + " needs to be a numeric value!!") + if ptype == "class" : + not_matching = [x for x in pvalue.split(",") if x not in p["value"]] + if not_matching != [] : + exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") + + + # Mass tolerances: do they include Da or ppm exclusively? + if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance" : + unit = pvalue.split(" ")[1] + if unit != "Da" and unit != "ppm" : + exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the value!!\nWe found " + unit) + + ## ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES + if pname == "enzyme" : + ols_out = olsclient.search(pvalue, ontology="MS", exact=True) + if ols_out == None or len(ols_out) > 1 : + exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") + pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] + + + ## Modifications: look up in Unimod + if pname == "fixed_mods" or pname == "variable_mods" : + mods = pvalue.split(",") + for m in mods : + tmod = m.split(" of ") + if len(tmod) < 2: + exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. Note that it should be single residues") + modname = tmod[0] + modpos = tmod[1] + found = [x for x in unimod.modifications if modname == x.get_name()] + if found == [] : + exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. 
Also check whether you used space between the comma separated modifications") + modtype = "fixed" + if (pname == "variable_mods") : modtype = "variable" + if re.match("[A-Z]",modpos) : + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";TA=" + modpos + elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"] : + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";PP=" + modpos + else : + exit("ERROR: Wrong residue given: " + modpos + ". Should be either one upper case letter or any of \"Protein N-term\", \"Protein C-term\", \"Any N-term\", \"Any C-term\"") + + +#print(found_list[0].get_name()) + + + ## Now finally writing the value + else : + sdrf_content[psdrf] = pvalue + + + + +## OVERWRITE RAW FILES IF GIVEN TO DIRECT TO THE CORRECT LOCATION? + +## ADD FASTA FILE TO SDRF (COMMENT:FASTA DATABASE FILE)? + +## WRITE EXPERIMENTAL DESIGN IF NO SDRF? + + +## adding modification columns +colnames = list(sdrf_content.columns) + ["comment[modification parameters]"] * len(mod_columns.columns) + +sdrf_content = pd.concat([sdrf_content, mod_columns], axis=1) + +print("--- Writing sdrf file into sdrf_local.tsv ---") +sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False) + +## Verify with sdrf-parser +check_sdrf = SdrfDataFrame() +check_sdrf.parse("sdrf_local.tsv") +check_sdrf.validate("mass_spectrometry") + + +print("########## SUMMARY #########") +print("--- The following parameters have been overwritten in the sdrf file: ---") +for p in overwritten : + print(p) diff --git a/setup.py b/setup.py index 61d2418..be52dc5 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ long_description_content_type="text/markdown", long_description=long_description, license="'Apache 2.0", - data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml"])], + data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/maxquant/param2sdrf.yml"])], package_data={'': ['*.xml'], }, url="https://github.com/bigbio/sdrf-pipelines", packages=find_packages(), From ac1478804c6f0c3aa612865d98987d78b02f18ff Mon Sep 17 00:00:00 2001 From: veitveit Date: Thu, 27 Jan 2022 18:29:15 +0100 Subject: [PATCH 2/8] added various data analysis parameters --- ...drf-mapper.py => add_datanalysis_param.py} | 0 sdrf_pipelines/sdrf_merge/params2sdrf.yml | 223 ------------------ 2 files changed, 223 deletions(-) rename sdrf_pipelines/sdrf_merge/{sdrf-mapper.py => add_datanalysis_param.py} (100%) delete mode 100644 sdrf_pipelines/sdrf_merge/params2sdrf.yml diff --git a/sdrf_pipelines/sdrf_merge/sdrf-mapper.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py similarity index 100% rename from sdrf_pipelines/sdrf_merge/sdrf-mapper.py rename to sdrf_pipelines/sdrf_merge/add_datanalysis_param.py diff --git a/sdrf_pipelines/sdrf_merge/params2sdrf.yml b/sdrf_pipelines/sdrf_merge/params2sdrf.yml deleted file mode 100644 index 07e6e97..0000000 --- a/sdrf_pipelines/sdrf_merge/params2sdrf.yml +++ /dev/null @@ -1,223 +0,0 @@ -name: param2sdrf - -description: Mapping of parameters to run workflows with sdrf headers, including default values - -parameters: -- name: fixed_mods - type: ontology - sdrf: modification parameters - default: NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 - comment: only as unimod names with residue(s) separate via comma - -- name: variable_mods - type: ontology - sdrf: modification parameters - default: 
NT=oxidation;MT=variable;TA=M;AC=UNIMOD:35 - comment: only as unimod names with residue(s) - -- name: precursor_mass_tolerance - type: string - sdrf: precursor mass tolerance - default: 30 ppm - -- name: fragment_mass_tolerance - type: string - sdrf: fragment mass tolerance - default: 0.05 Da - -- name: enzyme - type: ontology - sdrf: cleavage agent details - default: Trypsin - -#- name: type_cleavage -# type: class -# sdrf: type of cleavage -# default: - -- name: fions - type: class - sdrf: forward ions - default: b - value: - - a - - a1 - - b - - c - - c1 - - a1 - -- name: rions - type: class - sdrf: reverse ions - default: y - value: - - x - - y - - z - - z1 - - z2 - -- name: isotope_error_range - type: integer - sdrf: isotope error range - default: 0 - -- name: add_decoys - type: boolean - sdrf: add decoys - default: true - -- name: num_hits - type: integer - sdrf: num peptide hits - default: 1 - -- name: miscleavages - type: integer - sdrf: allowed miscleavages - default: 1 - -- name: min_precursor_charge - type: integer - sdrf: minimum precursor charge - default: 2 - -- name: max_precursor_charge - type: integer - sdrf: maximum precursor charge - default: 3 - -- name: min_peptide_length - type: integer - sdrf: minimum peptide length - default: 8 - -- name: max_peptide_length - type: integer - sdrf: maximum peptide length - default: 12 - -- name: max_mods - type: integer - sdrf: maximum allowed modification - default: 4 - -- name: fdr_psm - type: float - sdrf: fdr on psm level - default: 0.01 - -- name: fdr_peptide - type: float - sdrf: fdr on peptide level - default: 0.01 - -- name: fdr_protein - type: float - sdrf: fdr on protein level - default: 0.01 - -#- name: match_between_runs_rt_tol -#- type: float -#- sdrf: retention time tolerance match between runs -#- default: TODO - -#- name: match_between_runs_mz_tol -#- type: float -#- sdrf: m over z tolerance match between runs -#- default: TODO - -- name: match_between_runs - type: boolean - sdrf: run match between runs - default: true - -- name: protein_inference - type: class - sdrf: protein inference method - default: unique - value: - - unique - - shared - - parsimony - - other - -- name: quantification_method - type: class - sdrf: quantification method - default: intensity - value: - - precursor - - ms2 - - labelled - - other - -- name: summarization_proteins - type: class - sdrf: summarization of proteins method - default: sum_abs - value: - - sum_abs - - mean - - median - - other - comment: sum_abs is the only one on non-transformed data - -- name: min_num_peptides - type: integer - sdrf: minimum number of peptides per protein - default: 2 - -- name: summarization_psms - type: class - sdrf: summarization of psms method - default: sum_abs - value: - - sum_abs - - mean - - median - - other - - -- name: quant_transformation - type: class - sdrf: transformation of quantitative values - default: log - value: - - log - - rlog - - none - - other - -- name: normalization_method - type: class - sdrf: normalization method - default: median - value: - - mean - - median - - quantile - - linear - - non-linear - - other - -- name: run_statistics - type: boolean - sdrf: run statistical tests - default: true - -- name: fdr_method - type: class - sdrf: method for correction of multiple testing - default: benjamini-hochberg - value: - - benjamini-hochberg - - bonferroni - - qvalue - - other - -- name: fdr_threshold - type: float - sdrf: threshold for statistical test fdr - default: 0.01 From fd97ffded714d409620cbffd4c5cee07338223db Mon Sep 17 
00:00:00 2001 From: veitveit Date: Fri, 28 Jan 2022 12:47:50 +0100 Subject: [PATCH 3/8] corrected style --- sdrf_pipelines/maxquant/maxquant.py | 14 ++++---- .../sdrf_merge/add_datanalysis_param.py | 35 +++++++++---------- setup.py | 2 +- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py index 5852ae4..c23db2a 100644 --- a/sdrf_pipelines/maxquant/maxquant.py +++ b/sdrf_pipelines/maxquant/maxquant.py @@ -649,7 +649,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns sdrf.columns = map(str.lower, sdrf.columns) # convert column names to lower-case with open(self.datparamfile) as file: - param_mapping = yaml.load(file, Loader=yaml.FullLoader) + param_mapping = yaml.safe_load(file) mapping = param_mapping["parameters"] datparams = dict() for i in mapping: @@ -797,7 +797,6 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns else: file2technical_rep[raw] = "1" - # store highest replicate number for this source name if source_name in source_name2n_reps: source_name2n_reps[source_name] = max(int(source_name2n_reps[source_name]), @@ -1301,10 +1300,9 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns warning_message = "multiple values for parameter minimum peptide length, taking the first: " + first self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 else: - minPepLen.appendChild(doc.createTextNode('7')) + minPepLen.appendChild(doc.createTextNode('7')) root.appendChild(minPepLen) - psmFdrCrosslink = doc.createElement('psmFdrCrosslink') psmFdrCrosslink.appendChild(doc.createTextNode('0.01')) root.appendChild(psmFdrCrosslink) @@ -1324,7 +1322,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(peptideFdr) proteinFdr = doc.createElement('proteinFdr') - tparam = file2params["fdr_protein"] + tparam = file2params["fdr_protein"] if (len(tparam) > 0): first = list(tparam.values())[0] warning_message = "overwriting protein FDR using the value in the sdrf file" @@ -1338,7 +1336,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(proteinFdr) siteFdr = doc.createElement('siteFdr') - tparam = file2params["fdr_psm"] + tparam = file2params["fdr_psm"] if (len(tparam) > 0): first = list(tparam.values())[0] warning_message = "overwriting PSM FDR using the value in the sdrf file" @@ -1364,7 +1362,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(useNormRatiosForOccupancy) minPeptides = doc.createElement('minPeptides') - tparam = file2params["min_num_peptides"] + tparam = file2params["min_num_peptides"] if (len(tparam) > 0): first = list(tparam.values())[0] minPeptides.appendChild(doc.createTextNode(first)) @@ -1445,7 +1443,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns quantMode = doc.createElement('quantMode') - tparam = file2params["quantification_method"] + tparam = file2params["quantification_method"] if (len(tparam) > 0): first = list(tparam.values())[0] if first == "unique": diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 6a649a2..078472e 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -1,9 +1,9 @@ import pandas as pd import re import yaml -import sdrf_pipelines 
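+ # Illustrative sketch (not part of the patch): what the overridden nodes in the maxquant.py hunks above end up looking like in the generated mqpar.xml. Element names follow the diff; the 0.05 value is a made-up example standing in for an SDRF-supplied "fdr on protein level".
+ # from xml.dom.minidom import Document
+ # doc = Document()
+ # root = doc.createElement('MaxQuantParams')
+ # doc.appendChild(root)
+ # proteinFdr = doc.createElement('proteinFdr')
+ # proteinFdr.appendChild(doc.createTextNode('0.05'))  # value taken from the SDRF column
+ # root.appendChild(proteinFdr)
+ # print(doc.toprettyxml(indent='  '))  # contains <proteinFdr>0.05</proteinFdr>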
+#import sdrf_pipelines import os.path -from sdrf_pipelines.zooma.zooma import Zooma, OlsClient +from sdrf_pipelines.zooma.zooma import OlsClient from sdrf_pipelines.openms.unimod import UnimodDatabase from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame @@ -20,13 +20,13 @@ overwritten = set() with open(r'param2sdrf.yml') as file: - param_mapping = yaml.load(file, Loader=yaml.FullLoader) + param_mapping = yaml.safe_load(file) mapping = param_mapping["parameters"] ## READ PARAMETERS FOR RUNNING WORKFLOW with open(r'params.yml') as file: - tparams_in = yaml.load(file, Loader=yaml.FullLoader) + tparams_in = yaml.safe_load(file) params_in = tparams_in["params"] rawfiles = tparams_in["rawfiles"] fastafile = tparams_in["fastafile"] @@ -49,13 +49,12 @@ ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable")) ] mod_columns.drop(ttt, axis=1, inplace=True) overwritten.add("variable_mods") - else: ## THROW ERROR FOR MISSING SDRF exit("ERROR: No SDRF file given. Add an at least minimal version\nFor more details, see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") - + ## FIRST STANDARD PARAMETERS # FOR GIVEN PARAMETERS # CHECK WHETHER COLUMN IN SDRF TO PUT WARNING AND OVERWRITE @@ -83,32 +82,32 @@ # for each type: check consistency #print(type(pvalue)) - if ptype == "boolean" : + if ptype == "boolean": if not isinstance(pvalue, bool) : exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") - if ptype == "str" : + if ptype == "str": if not isinstance(pvalue, str) : exit("ERROR: " + pname + " needs to be a string!!") - if ptype == "integer" : + if ptype == "integer": if not isinstance(pvalue, int) : exit("ERROR: " + pname + " needs to be a string!!") - if ptype == "float" : - if not isinstance(pvalue, (float, int)) : + if ptype == "float": + if not isinstance(pvalue, (float, int)): exit("ERROR: " + pname + " needs to be a numeric value!!") - if ptype == "class" : + if ptype == "class": not_matching = [x for x in pvalue.split(",") if x not in p["value"]] if not_matching != [] : exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") # Mass tolerances: do they include Da or ppm exclusively? - if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance" : - unit = pvalue.split(" ")[1] + if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance": + unit = pvalue.split(" ")[1] if unit != "Da" and unit != "ppm" : exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the value!!\nWe found " + unit) ## ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES - if pname == "enzyme" : + if pname == "enzyme": ols_out = olsclient.search(pvalue, ontology="MS", exact=True) if ols_out == None or len(ols_out) > 1 : exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") @@ -128,7 +127,7 @@ if found == [] : exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. 
Also check whether you used space between the comma separated modifications") modtype = "fixed" - if (pname == "variable_mods") : modtype = "variable" + if (pname == "variable_mods") : modtype = "variable" if re.match("[A-Z]",modpos) : mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";TA=" + modpos elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"] : @@ -141,7 +140,7 @@ ## Now finally writing the value - else : + else : sdrf_content[psdrf] = pvalue @@ -160,7 +159,7 @@ sdrf_content = pd.concat([sdrf_content, mod_columns], axis=1) print("--- Writing sdrf file into sdrf_local.tsv ---") -sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False) +sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False) ## Verify with sdrf-parser check_sdrf = SdrfDataFrame() diff --git a/setup.py b/setup.py index be52dc5..b77a31d 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ long_description_content_type="text/markdown", long_description=long_description, license="'Apache 2.0", - data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/maxquant/param2sdrf.yml"])], + data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/sdrf_merge/param2sdrf.yml"])], package_data={'': ['*.xml'], }, url="https://github.com/bigbio/sdrf-pipelines", packages=find_packages(), From 275c3d1a74681aa9f00c0937f26e5281409ba26d Mon Sep 17 00:00:00 2001 From: veitveit Date: Fri, 11 Feb 2022 07:50:50 +0100 Subject: [PATCH 4/8] simplified loop --- sdrf_pipelines/maxquant/maxquant.py | 6 +- .../sdrf_merge/add_datanalysis_param.py | 107 +++++++++--------- 2 files changed, 56 insertions(+), 57 deletions(-) diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py index c23db2a..45928d5 100644 --- a/sdrf_pipelines/maxquant/maxquant.py +++ b/sdrf_pipelines/maxquant/maxquant.py @@ -1021,10 +1021,10 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns comment_p = 'comment[' + p + ']' if comment_p in row: file2params[datparams[p]][raw] = row[comment_p] - - - + + + # create maxquant parameters xml file doc = Document() diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 078472e..43b87c4 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -48,8 +48,7 @@ if "variable_mods" in params_in.keys(): ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable")) ] mod_columns.drop(ttt, axis=1, inplace=True) - overwritten.add("variable_mods") - + overwritten.add("variable_mods") else: ## THROW ERROR FOR MISSING SDRF exit("ERROR: No SDRF file given. 
Add an at least minimal version\nFor more details, see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") @@ -65,7 +64,7 @@ print("---- Parameter: " + pname + ": ----") if(pname in list(params_in.keys())) : print("Found in parameter file") - pvalue = params_in[pname] + pvalue = params_in[pname] else: print("Setting to default: " + p["default"]) pvalue = p["default"] @@ -80,60 +79,60 @@ print("WARNING: Overwriting values in sdrf file with " + pvalue) overwritten.add(pname) - # for each type: check consistency - #print(type(pvalue)) - if ptype == "boolean": - if not isinstance(pvalue, bool) : - exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") - if ptype == "str": - if not isinstance(pvalue, str) : - exit("ERROR: " + pname + " needs to be a string!!") - if ptype == "integer": - if not isinstance(pvalue, int) : - exit("ERROR: " + pname + " needs to be a string!!") - if ptype == "float": - if not isinstance(pvalue, (float, int)): - exit("ERROR: " + pname + " needs to be a numeric value!!") - if ptype == "class": - not_matching = [x for x in pvalue.split(",") if x not in p["value"]] - if not_matching != [] : - exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") - - - # Mass tolerances: do they include Da or ppm exclusively? - if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance": - unit = pvalue.split(" ")[1] - if unit != "Da" and unit != "ppm" : - exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the value!!\nWe found " + unit) - - ## ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES - if pname == "enzyme": - ols_out = olsclient.search(pvalue, ontology="MS", exact=True) - if ols_out == None or len(ols_out) > 1 : - exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") - pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] + # for each type: check consistency + #print(type(pvalue)) + if ptype == "boolean": + if not isinstance(pvalue, bool) : + exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") + elif ptype == "str": + if not isinstance(pvalue, str) : + exit("ERROR: " + pname + " needs to be a string!!") + elif ptype == "integer": + if not isinstance(pvalue, int) : + exit("ERROR: " + pname + " needs to be a string!!") + elif ptype == "float": + if not isinstance(pvalue, (float, int)): + exit("ERROR: " + pname + " needs to be a numeric value!!") + elif ptype == "class": + not_matching = [x for x in pvalue.split(",") if x not in p["value"]] + if not_matching != [] : + exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") + + + # Mass tolerances: do they include Da or ppm exclusively? 
+ if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance": + unit = pvalue.split(" ")[1] + if unit != "Da" and unit != "ppm" : + exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the value!!\nWe found " + unit) + + ## ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES + elif pname == "enzyme": + ols_out = olsclient.search(pvalue, ontology="MS", exact=True) + if ols_out == None or len(ols_out) > 1 : + exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") + pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] ## Modifications: look up in Unimod - if pname == "fixed_mods" or pname == "variable_mods" : - mods = pvalue.split(",") - for m in mods : - tmod = m.split(" of ") - if len(tmod) < 2: - exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. Note that it should be single residues") - modname = tmod[0] - modpos = tmod[1] - found = [x for x in unimod.modifications if modname == x.get_name()] - if found == [] : - exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. Also check whether you used space between the comma separated modifications") - modtype = "fixed" - if (pname == "variable_mods") : modtype = "variable" - if re.match("[A-Z]",modpos) : - mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";TA=" + modpos - elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"] : - mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";PP=" + modpos - else : - exit("ERROR: Wrong residue given: " + modpos + ". Should be either one upper case letter or any of \"Protein N-term\", \"Protein C-term\", \"Any N-term\", \"Any C-term\"") + elif pname == "fixed_mods" or pname == "variable_mods" : + mods = pvalue.split(",") + for m in mods : + tmod = m.split(" of ") + if len(tmod) < 2: + exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. Note that it should be single residues") + modname = tmod[0] + modpos = tmod[1] + found = [x for x in unimod.modifications if modname == x.get_name()] + if found == [] : + exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. Also check whether you used space between the comma separated modifications") + modtype = "fixed" + if (pname == "variable_mods") : modtype = "variable" + if re.match("[A-Z]",modpos) : + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";TA=" + modpos + elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"] : + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";PP=" + modpos + else : + exit("ERROR: Wrong residue given: " + modpos + ". 
Should be either one upper case letter or any of \"Protein N-term\", \"Protein C-term\", \"Any N-term\", \"Any C-term\"") #print(found_list[0].get_name()) From 7d5483f9ee5406e95a1f08da279120b86c4246d5 Mon Sep 17 00:00:00 2001 From: veitveit Date: Fri, 11 Feb 2022 07:58:37 +0100 Subject: [PATCH 5/8] simplified loop --- .../sdrf_merge/add_datanalysis_param.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 43b87c4..606105d 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -64,7 +64,7 @@ print("---- Parameter: " + pname + ": ----") if(pname in list(params_in.keys())) : print("Found in parameter file") - pvalue = params_in[pname] + pvalue = params_in[pname] else: print("Setting to default: " + p["default"]) pvalue = p["default"] @@ -74,10 +74,10 @@ if (psdrf) in sdrf_content.keys() : - if (len(set(sdrf_content[psdrf])) > 1) : + if (len(set(sdrf_content[psdrf])) > 1) : exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating the file into parts with the same data analysis parameters") - print("WARNING: Overwriting values in sdrf file with " + pvalue) - overwritten.add(pname) + print("WARNING: Overwriting values in sdrf file with " + pvalue) + overwritten.add(pname) # for each type: check consistency #print(type(pvalue)) @@ -88,15 +88,15 @@ if not isinstance(pvalue, str) : exit("ERROR: " + pname + " needs to be a string!!") elif ptype == "integer": - if not isinstance(pvalue, int) : + if not isinstance(pvalue, int) : exit("ERROR: " + pname + " needs to be a string!!") elif ptype == "float": if not isinstance(pvalue, (float, int)): exit("ERROR: " + pname + " needs to be a numeric value!!") elif ptype == "class": - not_matching = [x for x in pvalue.split(",") if x not in p["value"]] - if not_matching != [] : - exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") + not_matching = [x for x in pvalue.split(",") if x not in p["value"]] + if not_matching != [] : + exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") # Mass tolerances: do they include Da or ppm exclusively? 
@@ -110,7 +110,7 @@ ols_out = olsclient.search(pvalue, ontology="MS", exact=True) if ols_out == None or len(ols_out) > 1 : exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") - pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] + pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] ## Modifications: look up in Unimod From 72eae10c23037feaae4db52d4570b517021975a4 Mon Sep 17 00:00:00 2001 From: veitveit Date: Fri, 11 Feb 2022 08:05:43 +0100 Subject: [PATCH 6/8] simplified loop --- sdrf_pipelines/maxquant/maxquant.py | 4 ++-- sdrf_pipelines/sdrf_merge/add_datanalysis_param.py | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py index 45928d5..28c8791 100644 --- a/sdrf_pipelines/maxquant/maxquant.py +++ b/sdrf_pipelines/maxquant/maxquant.py @@ -1635,8 +1635,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns for p in file2params.keys(): if (len(file2params[p]) > 0): datanalysisparams[p] = file2params[p][key1] - - + + if tag == 0 and tmp == []: int_node = doc.createElement('int') int_node.appendChild(doc.createTextNode('0')) diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 606105d..81e0862 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -48,7 +48,7 @@ if "variable_mods" in params_in.keys(): ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable")) ] mod_columns.drop(ttt, axis=1, inplace=True) - overwritten.add("variable_mods") + overwritten.add("variable_mods") else: ## THROW ERROR FOR MISSING SDRF exit("ERROR: No SDRF file given. 
Add an at least minimal version\nFor more details, see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") @@ -71,8 +71,6 @@ psdrf = "comment[" + p["sdrf"] + "]" - - if (psdrf) in sdrf_content.keys() : if (len(set(sdrf_content[psdrf])) > 1) : exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating the file into parts with the same data analysis parameters") @@ -112,8 +110,7 @@ exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] - - ## Modifications: look up in Unimod + ## Modifications: look up in Unimod elif pname == "fixed_mods" or pname == "variable_mods" : mods = pvalue.split(",") for m in mods : From 05e0d5579f6fe13608033cfba7711b9d3c346236 Mon Sep 17 00:00:00 2001 From: veitveit Date: Fri, 11 Feb 2022 08:17:40 +0100 Subject: [PATCH 7/8] simplified loop --- sdrf_pipelines/maxquant/maxquant.py | 8 ++++---- sdrf_pipelines/sdrf_merge/add_datanalysis_param.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py index 28c8791..a8f4200 100644 --- a/sdrf_pipelines/maxquant/maxquant.py +++ b/sdrf_pipelines/maxquant/maxquant.py @@ -1646,7 +1646,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns if 'Lys8' in file2label[key1] or 'Arg10' in file2label[key1] or 'Arg6' in \ file2label[key1] or 'Lys6' in file2label[key1]: parameterGroup['0']['silac_shape'] = file2silac_shape[key1] - + elif {instr_val: value2} in tmp: int_node = doc.createElement('int') int_text = doc.createTextNode(str(tag)) @@ -1660,7 +1660,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns tmp.append({instr_val: value2}) parameterGroup[str(tag)] = {'instrument': file2instrument[key1], 'label': file2label[key1] , 'mods': file2mods[key1], 'enzyme': file2enzyme[key1], 'pctol': file2pctol[key1], 'fragtol': file2fragtol[key1],'pctolunit': file2pctolunit[key1], 'fragtolunit': file2fragtolunit[key1], 'datparams': datparams} - + if 'Lys8' in file2label[key1] or 'Arg10' in file2label[key1] or 'Arg6' in file2label[key1] \ or 'Lys6' in file2label[key1]: parameterGroup[str(tag)]['silac_shape'] = file2silac_shape[key1] @@ -1901,7 +1901,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns maxNmods.appendChild(doc.createTextNode('5')) - + maxMissedCleavages = doc.createElement('maxMissedCleavages') maxMissedCleavages.appendChild(doc.createTextNode('2')) enzymeMode = doc.createElement('enzymeMode') @@ -2030,7 +2030,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns searchTolInPpm = doc.createElement('searchTolInPpm') searchTolInPpm.appendChild(doc.createTextNode('False')) isotopeMatchTol = doc.createElement('isotopeMatchTol') - + isotopeMatchTol.appendChild(doc.createTextNode('2')) isotopeMatchTolInPpm = doc.createElement('isotopeMatchTolInPpm') isotopeMatchTolInPpm.appendChild(doc.createTextNode('True')) diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 81e0862..588a800 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -138,9 +138,9 @@ ## Now finally writing the value else : 
sdrf_content[psdrf] = pvalue - - + + ## OVERWRITE RAW FILES IF GIVEN TO DIRECT TO THE CORRECT LOCATION? From ad64539f6d85d16035d6b6f462513ad26f475238 Mon Sep 17 00:00:00 2001 From: veitveit Date: Thu, 17 Mar 2022 19:42:04 +0100 Subject: [PATCH 8/8] become flake8 compliant --- .../sdrf_merge/add_datanalysis_param.py | 262 ++++++++++-------- 1 file changed, 145 insertions(+), 117 deletions(-) diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 588a800..f3252f6 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -1,169 +1,197 @@ import pandas as pd import re import yaml -#import sdrf_pipelines import os.path from sdrf_pipelines.zooma.zooma import OlsClient from sdrf_pipelines.openms.unimod import UnimodDatabase from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame -## Accessing ontologies and CVs +# Accessing ontologies and CVs unimod = UnimodDatabase() olsclient = OlsClient() -#print(ols_out) +# print(ols_out) + +field_types = {"boolean": bool, "str": str, "integer": int, "float": (float, int)} + + +# Function for consistency checks +def verify_content(pname, pvalue, ptype): + # for each type: check consistency + # print(type(pvalue)) + if ptype in field_types.keys(): + if not isinstance(pvalue, field_types[ptype]): + exit("ERROR: " + pname + " needs to be " + ptype + "!!") +# if ptype == "boolean": +# if not isinstance(pvalue, bool): +# exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") +# elif ptype == "str": +# if not isinstance(pvalue, str): +# exit("ERROR: " + pname + " needs to be a string!!") +# elif ptype == "integer": +# if not isinstance(pvalue, int): +# exit("ERROR: " + pname + " needs to be a string!!") +# elif ptype == "float": +# if not isinstance(pvalue, (float, int)): +# exit("ERROR: " + pname + " needs to be a numeric value!!") + elif ptype == "class": + not_matching = [x for x in pvalue.split(",") if x not in p["value"]] + if not_matching != []: + exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + + ' '.join(not_matching) + " did not match") + + # Mass tolerances: do they include Da or ppm exclusively? + if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance": + unit = pvalue.split(" ")[1] + if unit != "Da" and unit != "ppm": + exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the \ +value!!\nWe found " + unit) + # ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES + elif pname == "enzyme": + ols_out = olsclient.search(pvalue, ontology="MS", exact=True) + if ols_out is None: + exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see \ +https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 \ +for available terms") + pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] + return pvalue + + +def new_or_default(params_in, pname, p): + if(pname in list(params_in.keys())): + print("Found in parameter file") + pvalue = params_in[pname] + else: + print("Setting to default: " + p["default"]) + pvalue = p["default"] + return(pvalue) + + +# Function to load modifications +def add_ptms(mods, pname, mod_columns): + for m in mods: + tmod = m.split(" of ") + if len(tmod) < 2: + exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. 
\ +Note that it should be single residues") + modname = tmod[0] + modpos = tmod[1] + found = [x for x in unimod.modifications if modname == x.get_name()] + if found == []: + exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. Also check whether you \ +used space between the comma separated modifications") + modtype = pname.replace("_mods", "") + if re.match("[A-Z]", modpos): + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" +\ + modtype + ";TA=" + modpos + elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"]: + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" +\ + modtype + ";PP=" + modpos + else: + exit("ERROR: Wrong residue given: " + modpos + ". Should be either one upper case letter or any of \"Protein N-term\", \ + \"Protein C-term\", \"Any N-term\", \"Any C-term\"") + return mod_columns + # modifications have the same column name, not working with pandas # therefore separated mod_columns = pd.DataFrame() -## For summary at the end +# For summary at the end overwritten = set() with open(r'param2sdrf.yml') as file: - param_mapping = yaml.safe_load(file) - mapping = param_mapping["parameters"] + param_mapping = yaml.safe_load(file) + mapping = param_mapping["parameters"] -## READ PARAMETERS FOR RUNNING WORKFLOW +# READ PARAMETERS FOR RUNNING WORKFLOW with open(r'params.yml') as file: - tparams_in = yaml.safe_load(file) - params_in = tparams_in["params"] - rawfiles = tparams_in["rawfiles"] - fastafile = tparams_in["fastafile"] - + tparams_in = yaml.safe_load(file) + params_in = tparams_in["params"] + rawfiles = tparams_in["rawfiles"] + fastafile = tparams_in["fastafile"] - -## WE NEED AN SDRF FILE FOR THE EXPERIMENTAL DESIGN, CONTAINING FILE LOCATIONS +# WE NEED AN SDRF FILE FOR THE EXPERIMENTAL DESIGN, CONTAINING FILE LOCATIONS sdrf_content = pd.DataFrame() has_sdrf = os.path.isfile("./sdrf.tsv") -if has_sdrf : - sdrf_content = pd.read_csv("sdrf.tsv", sep="\t") - mod_columns = sdrf_content.filter(like="comment[modification parameters]") - sdrf_content = sdrf_content.drop(columns=mod_columns.columns) - # delete columns with fixed/variable modification info - if "fixed_mods" in params_in.keys(): - ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=fixed")) ] - mod_columns.drop(ttt, axis=1, inplace=True) - overwritten.add("fixed_mods") - if "variable_mods" in params_in.keys(): - ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable")) ] - mod_columns.drop(ttt, axis=1, inplace=True) - overwritten.add("variable_mods") +if has_sdrf: + sdrf_content = pd.read_csv("sdrf.tsv", sep="\t") + mod_columns = sdrf_content.filter(like="comment[modification parameters]") + sdrf_content = sdrf_content.drop(columns=mod_columns.columns) + sdrf_content["comment[modification parameters]"] = None + # delete columns with fixed/variable modification info + if "fixed_mods" in params_in.keys(): + ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=fixed"))] + mod_columns.drop(ttt, axis=1, inplace=True) + overwritten.add("fixed_mods") + if "variable_mods" in params_in.keys(): + ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable"))] + mod_columns.drop(ttt, axis=1, inplace=True) + overwritten.add("variable_mods") else: - ## THROW ERROR FOR MISSING SDRF - exit("ERROR: No SDRF file given. 
Add an at least minimal version\nFor more details, see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") + # THROW ERROR FOR MISSING SDRF + exit("ERROR: No SDRF file given. Add an at least minimal version\nFor more details, \ +see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") -## FIRST STANDARD PARAMETERS +# FIRST STANDARD PARAMETERS # FOR GIVEN PARAMETERS # CHECK WHETHER COLUMN IN SDRF TO PUT WARNING AND OVERWRITE # IF NOT GIVEN, WRITE COLUMN for p in mapping: - pname = p["name"] - ptype = p["type"] - print("---- Parameter: " + pname + ": ----") - if(pname in list(params_in.keys())) : - print("Found in parameter file") - pvalue = params_in[pname] - else: - print("Setting to default: " + p["default"]) - pvalue = p["default"] - - psdrf = "comment[" + p["sdrf"] + "]" - - if (psdrf) in sdrf_content.keys() : - if (len(set(sdrf_content[psdrf])) > 1) : - exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating the file into parts with the same data analysis parameters") - print("WARNING: Overwriting values in sdrf file with " + pvalue) - overwritten.add(pname) - - # for each type: check consistency - #print(type(pvalue)) - if ptype == "boolean": - if not isinstance(pvalue, bool) : - exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") - elif ptype == "str": - if not isinstance(pvalue, str) : - exit("ERROR: " + pname + " needs to be a string!!") - elif ptype == "integer": - if not isinstance(pvalue, int) : - exit("ERROR: " + pname + " needs to be a string!!") - elif ptype == "float": - if not isinstance(pvalue, (float, int)): - exit("ERROR: " + pname + " needs to be a numeric value!!") - elif ptype == "class": - not_matching = [x for x in pvalue.split(",") if x not in p["value"]] - if not_matching != [] : - exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") - - - # Mass tolerances: do they include Da or ppm exclusively? - if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance": - unit = pvalue.split(" ")[1] - if unit != "Da" and unit != "ppm" : - exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the value!!\nWe found " + unit) - - ## ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES - elif pname == "enzyme": - ols_out = olsclient.search(pvalue, ontology="MS", exact=True) - if ols_out == None or len(ols_out) > 1 : - exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") - pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] + pname = p["name"] + ptype = p["type"] + print("---- Parameter: " + pname + ": ----") - ## Modifications: look up in Unimod - elif pname == "fixed_mods" or pname == "variable_mods" : - mods = pvalue.split(",") - for m in mods : - tmod = m.split(" of ") - if len(tmod) < 2: - exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. Note that it should be single residues") - modname = tmod[0] - modpos = tmod[1] - found = [x for x in unimod.modifications if modname == x.get_name()] - if found == [] : - exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. 
Also check whether you used space between the comma separated modifications") - modtype = "fixed" - if (pname == "variable_mods") : modtype = "variable" - if re.match("[A-Z]",modpos) : - mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";TA=" + modpos - elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"] : - mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";PP=" + modpos - else : - exit("ERROR: Wrong residue given: " + modpos + ". Should be either one upper case letter or any of \"Protein N-term\", \"Protein C-term\", \"Any N-term\", \"Any C-term\"") + pvalue = new_or_default(params_in, pname, p) + psdrf = "comment[" + p["sdrf"] + "]" + if psdrf in sdrf_content.keys(): + if (len(set(sdrf_content[psdrf])) > 1): + exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating \ +the file into parts with the same data analysis parameters") -#print(found_list[0].get_name()) + pvalue = verify_content(pname, pvalue, ptype) + # Modifications: look up in Unimod + if pname in ["fixed_mods", "variable_mods"] and pname in overwritten: + mods = pvalue.split(",") + print("WARNING: Overwriting " + pname + " values in sdrf file with " + pvalue) + mod_columns = add_ptms(mods, pname, mod_columns) - ## Now finally writing the value - else : - sdrf_content[psdrf] = pvalue + # Now finally writing the value + elif pname not in ["fixed_mods", "variable_mods"]: + print("WARNING: Overwriting " + pname + " values in sdrf file with " + pvalue) + overwritten.add(pname) + sdrf_content[psdrf] = pvalue + else: + sdrf_content[psdrf] = pvalue +# OVERWRITE RAW FILES IF GIVEN TO DIRECT TO THE CORRECT LOCATION? +# ADD FASTA FILE TO SDRF (COMMENT:FASTA DATABASE FILE)? -## OVERWRITE RAW FILES IF GIVEN TO DIRECT TO THE CORRECT LOCATION? +# WRITE EXPERIMENTAL DESIGN IF NO SDRF? -## ADD FASTA FILE TO SDRF (COMMENT:FASTA DATABASE FILE)? - -## WRITE EXPERIMENTAL DESIGN IF NO SDRF? - - -## adding modification columns -colnames = list(sdrf_content.columns) + ["comment[modification parameters]"] * len(mod_columns.columns) +# adding modification columns +colnames = list(sdrf_content.columns) + ["comment[modification parameters]"] * len(mod_columns.columns) sdrf_content = pd.concat([sdrf_content, mod_columns], axis=1) +sdrf_content.columns = colnames + +sdrf_content.dropna(how='all', axis=1, inplace=True) print("--- Writing sdrf file into sdrf_local.tsv ---") -sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False) +# sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False) +sdrf_content.to_csv("sdrf_local.tsv", sep="\t") -## Verify with sdrf-parser +# Verify with sdrf-parser check_sdrf = SdrfDataFrame() check_sdrf.parse("sdrf_local.tsv") check_sdrf.validate("mass_spectrometry") - print("########## SUMMARY #########") print("--- The following parameters have been overwritten in the sdrf file: ---") -for p in overwritten : - print(p) +for p in overwritten: + print(p)
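
--
Illustrative note (not part of any patch above): the final add_datanalysis_param.py iterates over the "parameters" entries of param2sdrf.yml, takes each value from params.yml or falls back to the entry's default (new_or_default), validates it (verify_content), and writes it into a constant "comment[...]" column of the SDRF table. The short self-contained Python sketch below condenses that mapping step with inline data instead of the YAML/TSV files; the "sdrf" keys and default values shown here are hypothetical examples, not the actual entries shipped in sdrf_pipelines/sdrf_merge/param2sdrf.yml.

    # Minimal sketch of the parameter-to-SDRF mapping step (assumed example values).
    import pandas as pd

    # Hypothetical stand-ins for two entries of param2sdrf.yml ("parameters" list).
    mapping = [
        {"name": "precursor_mass_tolerance", "sdrf": "precursor mass tolerance",
         "type": "str", "default": "10 ppm"},
        {"name": "fragment_mass_tolerance", "sdrf": "fragment mass tolerance",
         "type": "str", "default": "0.05 Da"},
    ]
    # Stand-in for the "params" section of params.yml.
    params_in = {"precursor_mass_tolerance": "20 ppm"}

    # Toy SDRF with two raw files; the real input comes from sdrf.tsv.
    sdrf_content = pd.DataFrame({"comment[data file]": ["run1.raw", "run2.raw"]})

    for p in mapping:
        # Value from the parameter file wins over the default, mirroring new_or_default().
        pvalue = params_in.get(p["name"], p["default"])
        # Tolerances must be "<number> <Da|ppm>", the check verify_content() enforces.
        assert pvalue.split(" ")[1] in ("Da", "ppm")
        # One constant-valued comment[...] column per mapped parameter.
        sdrf_content["comment[" + p["sdrf"] + "]"] = pvalue

    print(sdrf_content)

Running the sketch prints a table with one extra comment[...] column per mapped parameter, the same value in every row, which is the shape the downstream MaxQuant converter reads back per raw file.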