From 2f99f734dff7bc327522252ed6ed06335b24da54 Mon Sep 17 00:00:00 2001 From: veitveit Date: Thu, 27 Jan 2022 18:27:59 +0100 Subject: [PATCH 1/8] added various data analysis parameters --- sdrf_pipelines/maxquant/maxquant.py | 218 +++++++++++++++------ sdrf_pipelines/sdrf_merge/param2sdrf.yml | 223 ++++++++++++++++++++++ sdrf_pipelines/sdrf_merge/params2sdrf.yml | 223 ++++++++++++++++++++++ sdrf_pipelines/sdrf_merge/sdrf-mapper.py | 174 +++++++++++++++++ setup.py | 2 +- 5 files changed, 778 insertions(+), 62 deletions(-) create mode 100644 sdrf_pipelines/sdrf_merge/param2sdrf.yml create mode 100644 sdrf_pipelines/sdrf_merge/params2sdrf.yml create mode 100644 sdrf_pipelines/sdrf_merge/sdrf-mapper.py diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py index df7f0cb..5852ae4 100644 --- a/sdrf_pipelines/maxquant/maxquant.py +++ b/sdrf_pipelines/maxquant/maxquant.py @@ -11,6 +11,7 @@ from xml.dom.minidom import parse from xml.dom.minidom import Document from datetime import datetime +import yaml import os import numpy as np import pkg_resources @@ -22,6 +23,8 @@ def __init__(self) -> None: super().__init__() self.warnings = dict() self.modfile = pkg_resources.resource_filename(__name__, "modifications.xml") + self.datparamfile = pkg_resources.resource_filename(__name__, "param2sdrf.yml") + print(self.datparamfile) def create_new_mods(self, mods, mqconfdir): i = 0 @@ -645,6 +648,14 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns sdrf = sdrf.astype(str) sdrf.columns = map(str.lower, sdrf.columns) # convert column names to lower-case + with open(self.datparamfile) as file: + param_mapping = yaml.load(file, Loader=yaml.FullLoader) + mapping = param_mapping["parameters"] + datparams = dict() + for i in mapping: + datparams[i["sdrf"]] = i["name"] + + # map filename to tuple of [fixed, variable] mods mod_cols = [c for ind, c in enumerate(sdrf) if c.startswith('comment[modification parameters')] # columns with modification parameters @@ -670,6 +681,11 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns source_name2n_reps = dict() file2technical_rep = dict() file2instrument = dict() + # New parameters from extended SDRF (including data analysis parameters) + file2params = dict() + for p in datparams.values(): + file2params[p] = dict() + if mqconfdir: self.create_new_mods(sdrf[mod_cols], mqconfdir) @@ -760,6 +776,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns file2fragtol[raw] = "20" file2fragtolunit[raw] = "ppm" + # STILL NOT IMPLEMENTED! 
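+ # Illustrative sketch (not part of the patch): how the new param2sdrf.yml mapping is meant to be consumed. datparams inverts the YAML entries to map SDRF comment-column names to workflow parameter names, and file2params then collects one value per raw file. The two mapping entries and the SDRF row below are made-up examples.
+ # param_mapping = {"parameters": [{"name": "min_peptide_length", "sdrf": "minimum peptide length"}, {"name": "fdr_protein", "sdrf": "fdr on protein level"}]}
+ # datparams = {entry["sdrf"]: entry["name"] for entry in param_mapping["parameters"]}
+ # file2params = {name: dict() for name in datparams.values()}
+ # raw = "sample_01.raw"
+ # row = {"comment[minimum peptide length]": "7", "comment[fdr on protein level]": "0.01"}
+ # for sdrf_name, param_name in datparams.items():
+ #     comment_p = "comment[" + sdrf_name + "]"
+ #     if comment_p in row:
+ #         file2params[param_name][raw] = row[comment_p]
+ # file2params == {"min_peptide_length": {"sample_01.raw": "7"}, "fdr_protein": {"sample_01.raw": "0.01"}}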
if 'comment[dissociation method]' in row: if row['comment[dissociation method]'] == 'not available': file2diss[raw] = 'HCD' @@ -780,6 +797,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns else: file2technical_rep[raw] = "1" + # store highest replicate number for this source name if source_name in source_name2n_reps: source_name2n_reps[source_name] = max(int(source_name2n_reps[source_name]), @@ -999,6 +1017,15 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns file2label[raw] = lt + # Reading data analysis parameters + for p in datparams.keys(): + comment_p = 'comment[' + p + ']' + if comment_p in row: + file2params[datparams[p]][raw] = row[comment_p] + + + + # create maxquant parameters xml file doc = Document() @@ -1144,7 +1171,14 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(secondPeptide) matchBetweenRuns_node = doc.createElement('matchBetweenRuns') - matchBetweenRuns_node.appendChild(doc.createTextNode(matchBetweenRuns)) + if (len(file2params["match_between_runs"]) > 0): + first = list(file2params["match_between_runs"].values())[0] + matchBetweenRuns_node.appendChild(doc.createTextNode(first)) + if(len(set(file2params["match_between_runs"].values())) > 1): + warning_message = "multiple values for match between runs, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + matchBetweenRuns_node.appendChild(doc.createTextNode(matchBetweenRuns)) root.appendChild(matchBetweenRuns_node) matchUnidentifiedFeatures = doc.createElement('matchUnidentifiedFeatures') @@ -1259,23 +1293,62 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(intensityPredictionsFile) minPepLen = doc.createElement('minPepLen') - minPepLen.appendChild(doc.createTextNode('7')) + tparam = file2params["min_peptide_length"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + minPepLen.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter minimum peptide length, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + minPepLen.appendChild(doc.createTextNode('7')) root.appendChild(minPepLen) + psmFdrCrosslink = doc.createElement('psmFdrCrosslink') psmFdrCrosslink.appendChild(doc.createTextNode('0.01')) root.appendChild(psmFdrCrosslink) peptideFdr = doc.createElement('peptideFdr') - peptideFdr.appendChild(doc.createTextNode(str(peptideFDR))) + tparam = file2params["fdr_peptide"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + warning_message = "overwriting peptide FDR using the value in the sdrf file" + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + peptideFdr.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter Peptide FDR, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + peptideFdr.appendChild(doc.createTextNode(str(peptideFDR))) root.appendChild(peptideFdr) proteinFdr = doc.createElement('proteinFdr') - proteinFdr.appendChild(doc.createTextNode(str(proteinFDR))) + tparam = file2params["fdr_protein"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + warning_message = "overwriting protein FDR using the value in the sdrf file" + self.warnings[warning_message] = 
self.warnings.get(warning_message, 0) + 1 + proteinFdr.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter Protein FDR, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + proteinFdr.appendChild(doc.createTextNode(str(proteinFDR))) root.appendChild(proteinFdr) siteFdr = doc.createElement('siteFdr') - siteFdr.appendChild(doc.createTextNode('0.01')) + tparam = file2params["fdr_psm"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + warning_message = "overwriting PSM FDR using the value in the sdrf file" + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + siteFdr.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter PSM FDR, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + siteFdr.appendChild(doc.createTextNode('0.01')) root.appendChild(siteFdr) minPeptideLengthForUnspecificSearch = doc.createElement('minPeptideLengthForUnspecificSearch') @@ -1291,7 +1364,15 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(useNormRatiosForOccupancy) minPeptides = doc.createElement('minPeptides') - minPeptides.appendChild(doc.createTextNode('1')) + tparam = file2params["min_num_peptides"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + minPeptides.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter minimum number of peptides, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + minPeptides.appendChild(doc.createTextNode('1')) root.appendChild(minPeptides) minRazorPeptides = doc.createElement('minRazorPeptides') @@ -1362,8 +1443,23 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns compositionPrediction.appendChild(doc.createTextNode('0')) root.appendChild(compositionPrediction) + quantMode = doc.createElement('quantMode') - quantMode.appendChild(doc.createTextNode('1')) + tparam = file2params["quantification_method"] + if (len(tparam) > 0): + first = list(tparam.values())[0] + if first == "unique": + first = '2' + elif first == "shared": + first = '0' + else: + first = '1' + quantMode.appendChild(doc.createTextNode(first)) + if(len(set(tparam.values())) > 1): + warning_message = "multiple values for parameter Quantification mode, taking the first: " + first + self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 + else: + quantMode.appendChild(doc.createTextNode('1')) root.appendChild(quantMode) massDifferenceMods = doc.createElement('massDifferenceMods') @@ -1536,23 +1632,23 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns referenceChannel = doc.createElement('referenceChannel') for key1, instr_val in file2instrument.items(): - value2 = str(file2enzyme[key1]) + file2label[key1] + str(file2mods[key1]) + str(file2pctol) \ - + str(file2fragtol) - + value2 = str(file2enzyme[key1]) + file2label[key1] + str(file2mods[key1]) + str(file2pctol) + str(file2fragtol) + datanalysisparams = dict() + for p in file2params.keys(): + if (len(file2params[p]) > 0): + datanalysisparams[p] = file2params[p][key1] + + if tag == 0 and tmp == []: int_node = doc.createElement('int') int_node.appendChild(doc.createTextNode('0')) 
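+ # Illustrative sketch (not part of the patch): the minPepLen, peptideFdr, proteinFdr, siteFdr, minPeptides and quantMode blocks above all repeat the same "take the first SDRF-supplied value, warn if the files disagree, otherwise keep the default" logic. A hypothetical helper along these lines would capture that pattern:
+ # def first_value_or_default(per_file_values, default, label, warnings):
+ #     # per_file_values is one of the file2params[...] dicts (raw file -> value)
+ #     if len(per_file_values) > 0:
+ #         first = list(per_file_values.values())[0]
+ #         if len(set(per_file_values.values())) > 1:
+ #             msg = "multiple values for parameter " + label + ", taking the first: " + first
+ #             warnings[msg] = warnings.get(msg, 0) + 1
+ #         return first
+ #     return default
+ # e.g. minPepLen.appendChild(doc.createTextNode(first_value_or_default(file2params["min_peptide_length"], '7', "minimum peptide length", self.warnings)))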
tmp.append({instr_val: value2}) + parameterGroup['0'] = {'instrument': file2instrument[key1], 'label': file2label[key1] , 'mods': file2mods[key1], 'enzyme': file2enzyme[key1], 'pctol': file2pctol[key1], + 'fragtol': file2fragtol[key1],'pctolunit': file2pctolunit[key1], 'fragtolunit': file2fragtolunit[key1], 'datanalysisparams': datanalysisparams } if 'Lys8' in file2label[key1] or 'Arg10' in file2label[key1] or 'Arg6' in \ - file2label[key1] or 'Lys6' in file2label[key1]: - parameterGroup['0'] = [file2instrument[key1], file2label[key1], file2mods[key1], file2enzyme[key1], - file2silac_shape[key1], file2pctol[key1], file2fragtol[key1], - file2pctolunit[key1], file2fragtolunit[key1]] - else: - parameterGroup['0'] = [file2instrument[key1], file2label[key1], file2mods[key1], file2enzyme[key1], - file2pctol[key1], file2fragtol[key1], file2pctolunit[key1], - [file2fragtolunit[key1]]] - + file2label[key1] or 'Lys6' in file2label[key1]: + parameterGroup['0']['silac_shape'] = file2silac_shape[key1] + elif {instr_val: value2} in tmp: int_node = doc.createElement('int') int_text = doc.createTextNode(str(tag)) @@ -1564,16 +1660,13 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns int_text = doc.createTextNode(str(tag)) int_node.appendChild(int_text) tmp.append({instr_val: value2}) + parameterGroup[str(tag)] = {'instrument': file2instrument[key1], 'label': file2label[key1] , 'mods': file2mods[key1], 'enzyme': file2enzyme[key1], + 'pctol': file2pctol[key1], 'fragtol': file2fragtol[key1],'pctolunit': file2pctolunit[key1], 'fragtolunit': file2fragtolunit[key1], 'datparams': datparams} + if 'Lys8' in file2label[key1] or 'Arg10' in file2label[key1] or 'Arg6' in file2label[key1] \ or 'Lys6' in file2label[key1]: - parameterGroup[str(tag)] = [file2instrument[key1], file2label[key1], file2mods[key1], - file2enzyme[key1], file2silac_shape[key1], - file2pctol[key1], file2fragtol[key1], file2pctolunit[key1], - file2fragtolunit[key1]] - else: - parameterGroup[str(tag)] = [file2instrument[key1], file2label[key1], file2mods[key1], - file2enzyme[key1], file2pctol[key1], file2fragtol[key1], - file2pctolunit[key1], file2fragtolunit[key1]] + parameterGroup[str(tag)]['silac_shape'] = file2silac_shape[key1] + paramGroupIndices.appendChild(int_node) @@ -1600,7 +1693,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns for i, j in parameterGroup.items(): parameterGroup = doc.createElement('parameterGroup') msInstrument = doc.createElement('msInstrument') - if 'Bruker Q-TOF' == j[0]: + if 'Bruker Q-TOF' == j['instrument']: msInstrument.appendChild(doc.createTextNode('1')) maxCharge = doc.createElement('maxCharge') maxCharge.appendChild(doc.createTextNode('5')) @@ -1624,7 +1717,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns advancedPeakSplitting.appendChild(doc.createTextNode('True')) intensityThreshold = doc.createElement('intensityThreshold') intensityThreshold.appendChild(doc.createTextNode('30')) - elif 'AB Sciex Q-TOF' == j[0]: + elif 'AB Sciex Q-TOF' == j["instrument"]: msInstrument.appendChild(doc.createTextNode('2')) maxCharge = doc.createElement('maxCharge') maxCharge.appendChild(doc.createTextNode('5')) @@ -1648,7 +1741,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns advancedPeakSplitting.appendChild(doc.createTextNode('True')) intensityThreshold = doc.createElement('intensityThreshold') intensityThreshold.appendChild(doc.createTextNode('0')) - elif 'Agilent Q-TOF' == j[0]: + 
elif 'Agilent Q-TOF' == j["instrument"]: msInstrument.appendChild(doc.createTextNode('3')) maxCharge = doc.createElement('maxCharge') maxCharge.appendChild(doc.createTextNode('5')) @@ -1672,7 +1765,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns advancedPeakSplitting.appendChild(doc.createTextNode('True')) intensityThreshold = doc.createElement('intensityThreshold') intensityThreshold.appendChild(doc.createTextNode('0')) - elif 'Bruker TIMS' == j[0]: + elif 'Bruker TIMS' == j["instrument"]: msInstrument.appendChild(doc.createTextNode('4')) maxCharge = doc.createElement('maxCharge') maxCharge.appendChild(doc.createTextNode('4')) @@ -1738,9 +1831,9 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns isotopeValleyFactor = doc.createElement('isotopeValleyFactor') isotopeValleyFactor.appendChild(doc.createTextNode('1.2')) labelMods = doc.createElement('labelMods') - if 'Lys8' in j[1] or 'Arg10' in j[1] or 'Arg6' in j[1] or 'Lys6' in j[1]: - for lm in range(j[4][0]): - r = j[1].split(',')[lm * j[4][1]:lm * j[4][1] + lm * j[4][1]] + if 'Lys8' in j["label"] or 'Arg10' in j["label"] or 'Arg6' in j["label"] or 'Lys6' in j["label"]: + for lm in range(j["silac_shape"][0]): + r = j["label"].split(',')[lm * j["silac_shape"][1]:lm * j["silac_shape"][1] + lm * j["silac_shape"][1]] if 'Arg0' in r: r.remove('Arg0') text = ';'.join(r) @@ -1753,7 +1846,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns string.appendChild(doc.createTextNode('')) labelMods.appendChild(string) lcmsRunType = doc.createElement('lcmsRunType') - if "TMT" in j[1] or 'iTRAQ' in j[1]: + if "TMT" in j["label"] or 'iTRAQ' in j["label"]: lcmsRunType.appendChild(doc.createTextNode('Reporter ion MS2')) else: lcmsRunType.appendChild(doc.createTextNode('Standard')) @@ -1762,7 +1855,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns # create label subnode lfqMode = doc.createElement('lfqMode') - if j[1] == 'label free sample': + if j["label"] == 'label free sample': lfqMode.appendChild(doc.createTextNode('1')) else: lfqMode.appendChild(doc.createTextNode('0')) @@ -1791,14 +1884,26 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns lfqMinRatioCount.appendChild(doc.createTextNode('2')) maxLabeledAa = doc.createElement('maxLabeledAa') multiplicity = doc.createElement('multiplicity') - if 'Lys8' in j[1] or 'Arg10' in j[1] or 'Arg6' in j[1] or 'Lys6' in j[1]: + if 'Lys8' in j["label"] or 'Arg10' in j["label"] or 'Arg6' in j["label"] or 'Lys6' in j["label"]: maxLabeledAa.appendChild(doc.createTextNode('3')) - multiplicity.appendChild(doc.createTextNode(str(j[4][0]))) + multiplicity.appendChild(doc.createTextNode(str(j["silac_shape"][0]))) else: maxLabeledAa.appendChild(doc.createTextNode('0')) multiplicity.appendChild(doc.createTextNode('1')) + + + maxNmods = doc.createElement('maxNmods') - maxNmods.appendChild(doc.createTextNode('5')) + if ("max_mods" in datanalysisparams): + print(len(set(datanalysisparams["max_mods"])) > 1) + first = datanalysisparams["max_mods"] + maxNmods.appendChild(doc.createTextNode(first)) + + else: + maxNmods.appendChild(doc.createTextNode('5')) + + + maxMissedCleavages = doc.createElement('maxMissedCleavages') maxMissedCleavages.appendChild(doc.createTextNode('2')) enzymeMode = doc.createElement('enzymeMode') @@ -1815,8 +1920,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns variableModifications = 
doc.createElement('variableModifications') fixedM_list = [] Variable_list = [] - fixedM_list.extend(j[2][0].split(',')) - Variable_list.extend(j[2][1].split(',')) + fixedM_list.extend(j["mods"][0].split(',')) + Variable_list.extend(j["mods"][1].split(',')) fixedM_list = list(set(fixedM_list)) Variable_list = list(set(Variable_list)) for F in fixedM_list: @@ -1832,9 +1937,9 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns # create enzymes subnode enzymes_node = doc.createElement('enzymes') - for index in range(len(j[3])): + for index in range(len(j["enzyme"])): string = doc.createElement('string') - string.appendChild(doc.createTextNode(j[3][index])) + string.appendChild(doc.createTextNode(j["enzyme"][index])) enzymes_node.appendChild(string) enzymesFirstSearch = doc.createElement('enzymesFirstSearch') enzymesFirstSearch.appendChild(doc.createTextNode('')) @@ -1851,8 +1956,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns multiModifications = doc.createElement('multiModifications') multiModifications.appendChild(doc.createTextNode('')) isobaricLabels = doc.createElement('isobaricLabels') - if "TMT" in j[1]: - for t in j[1].split(','): + if "TMT" in j["label"]: + for t in j["label"].split(','): IsobaricLabelInfo = doc.createElement('IsobaricLabelInfo') internalLabel = doc.createElement('internalLabel') internalLabel.appendChild(doc.createTextNode(t)) @@ -1876,8 +1981,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns IsobaricLabelInfo.appendChild(correctionFactorP2) IsobaricLabelInfo.appendChild(tmtLike) isobaricLabels.appendChild(IsobaricLabelInfo) - elif "iTRAQ" in j[1]: - for t in j[1].split(','): + elif "iTRAQ" in j["label"]: + for t in j["label"].split(','): IsobaricLabelInfo = doc.createElement('IsobaricLabelInfo') internalLabel = doc.createElement('internalLabel') internalLabel.appendChild(doc.createTextNode(t)) @@ -1918,25 +2023,16 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns doMassFiltering.appendChild(doc.createTextNode('True')) firstSearchTol = doc.createElement('firstSearchTol') mainSearchTol = doc.createElement('mainSearchTol') - if len(j) == 8: - firstSearchTol.appendChild(doc.createTextNode(str(j[5]))) - mainSearchTol.appendChild(doc.createTextNode(str(j[4]))) - if j[6] == 'ppm': + firstSearchTol.appendChild(doc.createTextNode(str(j["fragtol"]))) + mainSearchTol.appendChild(doc.createTextNode(str(j["pctol"]))) + if j["pctolunit"] == 'ppm': searchTolInPpm = doc.createElement('searchTolInPpm') searchTolInPpm.appendChild(doc.createTextNode('True')) - else: - searchTolInPpm = doc.createElement('searchTolInPpm') - searchTolInPpm.appendChild(doc.createTextNode('False')) else: - firstSearchTol.appendChild(doc.createTextNode(str(j[6]))) - mainSearchTol.appendChild(doc.createTextNode(str(j[5]))) - if j[7] == 'ppm': - searchTolInPpm = doc.createElement('searchTolInPpm') - searchTolInPpm.appendChild(doc.createTextNode('True')) - else: searchTolInPpm = doc.createElement('searchTolInPpm') searchTolInPpm.appendChild(doc.createTextNode('False')) isotopeMatchTol = doc.createElement('isotopeMatchTol') + isotopeMatchTol.appendChild(doc.createTextNode('2')) isotopeMatchTolInPpm = doc.createElement('isotopeMatchTolInPpm') isotopeMatchTolInPpm.appendChild(doc.createTextNode('True')) @@ -1967,7 +2063,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns filterPif = doc.createElement('filterPif') reporterFraction = 
doc.createElement('reporterFraction') reporterBasePeakRatio = doc.createElement('reporterBasePeakRatio') - if "TMT" in j[1]: + if "TMT" in j["label"]: reporterMassTolerance.appendChild(doc.createTextNode('0.003')) reporterPif.appendChild(doc.createTextNode('0')) filterPif.appendChild(doc.createTextNode('False')) diff --git a/sdrf_pipelines/sdrf_merge/param2sdrf.yml b/sdrf_pipelines/sdrf_merge/param2sdrf.yml new file mode 100644 index 0000000..07e6e97 --- /dev/null +++ b/sdrf_pipelines/sdrf_merge/param2sdrf.yml @@ -0,0 +1,223 @@ +name: param2sdrf + +description: Mapping of parameters to run workflows with sdrf headers, including default values + +parameters: +- name: fixed_mods + type: ontology + sdrf: modification parameters + default: NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 + comment: only as unimod names with residue(s) separate via comma + +- name: variable_mods + type: ontology + sdrf: modification parameters + default: NT=oxidation;MT=variable;TA=M;AC=UNIMOD:35 + comment: only as unimod names with residue(s) + +- name: precursor_mass_tolerance + type: string + sdrf: precursor mass tolerance + default: 30 ppm + +- name: fragment_mass_tolerance + type: string + sdrf: fragment mass tolerance + default: 0.05 Da + +- name: enzyme + type: ontology + sdrf: cleavage agent details + default: Trypsin + +#- name: type_cleavage +# type: class +# sdrf: type of cleavage +# default: + +- name: fions + type: class + sdrf: forward ions + default: b + value: + - a + - a1 + - b + - c + - c1 + - a1 + +- name: rions + type: class + sdrf: reverse ions + default: y + value: + - x + - y + - z + - z1 + - z2 + +- name: isotope_error_range + type: integer + sdrf: isotope error range + default: 0 + +- name: add_decoys + type: boolean + sdrf: add decoys + default: true + +- name: num_hits + type: integer + sdrf: num peptide hits + default: 1 + +- name: miscleavages + type: integer + sdrf: allowed miscleavages + default: 1 + +- name: min_precursor_charge + type: integer + sdrf: minimum precursor charge + default: 2 + +- name: max_precursor_charge + type: integer + sdrf: maximum precursor charge + default: 3 + +- name: min_peptide_length + type: integer + sdrf: minimum peptide length + default: 8 + +- name: max_peptide_length + type: integer + sdrf: maximum peptide length + default: 12 + +- name: max_mods + type: integer + sdrf: maximum allowed modification + default: 4 + +- name: fdr_psm + type: float + sdrf: fdr on psm level + default: 0.01 + +- name: fdr_peptide + type: float + sdrf: fdr on peptide level + default: 0.01 + +- name: fdr_protein + type: float + sdrf: fdr on protein level + default: 0.01 + +#- name: match_between_runs_rt_tol +#- type: float +#- sdrf: retention time tolerance match between runs +#- default: TODO + +#- name: match_between_runs_mz_tol +#- type: float +#- sdrf: m over z tolerance match between runs +#- default: TODO + +- name: match_between_runs + type: boolean + sdrf: run match between runs + default: true + +- name: protein_inference + type: class + sdrf: protein inference method + default: unique + value: + - unique + - shared + - parsimony + - other + +- name: quantification_method + type: class + sdrf: quantification method + default: intensity + value: + - precursor + - ms2 + - labelled + - other + +- name: summarization_proteins + type: class + sdrf: summarization of proteins method + default: sum_abs + value: + - sum_abs + - mean + - median + - other + comment: sum_abs is the only one on non-transformed data + +- name: min_num_peptides + type: integer + sdrf: minimum number 
of peptides per protein + default: 2 + +- name: summarization_psms + type: class + sdrf: summarization of psms method + default: sum_abs + value: + - sum_abs + - mean + - median + - other + + +- name: quant_transformation + type: class + sdrf: transformation of quantitative values + default: log + value: + - log + - rlog + - none + - other + +- name: normalization_method + type: class + sdrf: normalization method + default: median + value: + - mean + - median + - quantile + - linear + - non-linear + - other + +- name: run_statistics + type: boolean + sdrf: run statistical tests + default: true + +- name: fdr_method + type: class + sdrf: method for correction of multiple testing + default: benjamini-hochberg + value: + - benjamini-hochberg + - bonferroni + - qvalue + - other + +- name: fdr_threshold + type: float + sdrf: threshold for statistical test fdr + default: 0.01 diff --git a/sdrf_pipelines/sdrf_merge/params2sdrf.yml b/sdrf_pipelines/sdrf_merge/params2sdrf.yml new file mode 100644 index 0000000..07e6e97 --- /dev/null +++ b/sdrf_pipelines/sdrf_merge/params2sdrf.yml @@ -0,0 +1,223 @@ +name: param2sdrf + +description: Mapping of parameters to run workflows with sdrf headers, including default values + +parameters: +- name: fixed_mods + type: ontology + sdrf: modification parameters + default: NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 + comment: only as unimod names with residue(s) separate via comma + +- name: variable_mods + type: ontology + sdrf: modification parameters + default: NT=oxidation;MT=variable;TA=M;AC=UNIMOD:35 + comment: only as unimod names with residue(s) + +- name: precursor_mass_tolerance + type: string + sdrf: precursor mass tolerance + default: 30 ppm + +- name: fragment_mass_tolerance + type: string + sdrf: fragment mass tolerance + default: 0.05 Da + +- name: enzyme + type: ontology + sdrf: cleavage agent details + default: Trypsin + +#- name: type_cleavage +# type: class +# sdrf: type of cleavage +# default: + +- name: fions + type: class + sdrf: forward ions + default: b + value: + - a + - a1 + - b + - c + - c1 + - a1 + +- name: rions + type: class + sdrf: reverse ions + default: y + value: + - x + - y + - z + - z1 + - z2 + +- name: isotope_error_range + type: integer + sdrf: isotope error range + default: 0 + +- name: add_decoys + type: boolean + sdrf: add decoys + default: true + +- name: num_hits + type: integer + sdrf: num peptide hits + default: 1 + +- name: miscleavages + type: integer + sdrf: allowed miscleavages + default: 1 + +- name: min_precursor_charge + type: integer + sdrf: minimum precursor charge + default: 2 + +- name: max_precursor_charge + type: integer + sdrf: maximum precursor charge + default: 3 + +- name: min_peptide_length + type: integer + sdrf: minimum peptide length + default: 8 + +- name: max_peptide_length + type: integer + sdrf: maximum peptide length + default: 12 + +- name: max_mods + type: integer + sdrf: maximum allowed modification + default: 4 + +- name: fdr_psm + type: float + sdrf: fdr on psm level + default: 0.01 + +- name: fdr_peptide + type: float + sdrf: fdr on peptide level + default: 0.01 + +- name: fdr_protein + type: float + sdrf: fdr on protein level + default: 0.01 + +#- name: match_between_runs_rt_tol +#- type: float +#- sdrf: retention time tolerance match between runs +#- default: TODO + +#- name: match_between_runs_mz_tol +#- type: float +#- sdrf: m over z tolerance match between runs +#- default: TODO + +- name: match_between_runs + type: boolean + sdrf: run match between runs + default: true + +- name: 
protein_inference + type: class + sdrf: protein inference method + default: unique + value: + - unique + - shared + - parsimony + - other + +- name: quantification_method + type: class + sdrf: quantification method + default: intensity + value: + - precursor + - ms2 + - labelled + - other + +- name: summarization_proteins + type: class + sdrf: summarization of proteins method + default: sum_abs + value: + - sum_abs + - mean + - median + - other + comment: sum_abs is the only one on non-transformed data + +- name: min_num_peptides + type: integer + sdrf: minimum number of peptides per protein + default: 2 + +- name: summarization_psms + type: class + sdrf: summarization of psms method + default: sum_abs + value: + - sum_abs + - mean + - median + - other + + +- name: quant_transformation + type: class + sdrf: transformation of quantitative values + default: log + value: + - log + - rlog + - none + - other + +- name: normalization_method + type: class + sdrf: normalization method + default: median + value: + - mean + - median + - quantile + - linear + - non-linear + - other + +- name: run_statistics + type: boolean + sdrf: run statistical tests + default: true + +- name: fdr_method + type: class + sdrf: method for correction of multiple testing + default: benjamini-hochberg + value: + - benjamini-hochberg + - bonferroni + - qvalue + - other + +- name: fdr_threshold + type: float + sdrf: threshold for statistical test fdr + default: 0.01 diff --git a/sdrf_pipelines/sdrf_merge/sdrf-mapper.py b/sdrf_pipelines/sdrf_merge/sdrf-mapper.py new file mode 100644 index 0000000..6a649a2 --- /dev/null +++ b/sdrf_pipelines/sdrf_merge/sdrf-mapper.py @@ -0,0 +1,174 @@ +import pandas as pd +import re +import yaml +import sdrf_pipelines +import os.path +from sdrf_pipelines.zooma.zooma import Zooma, OlsClient +from sdrf_pipelines.openms.unimod import UnimodDatabase +from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame + +## Accessing ontologies and CVs +unimod = UnimodDatabase() +olsclient = OlsClient() +#print(ols_out) + +# modifications have the same column name, not working with pandas +# therefore separated +mod_columns = pd.DataFrame() + +## For summary at the end +overwritten = set() + +with open(r'param2sdrf.yml') as file: + param_mapping = yaml.load(file, Loader=yaml.FullLoader) + mapping = param_mapping["parameters"] + + +## READ PARAMETERS FOR RUNNING WORKFLOW +with open(r'params.yml') as file: + tparams_in = yaml.load(file, Loader=yaml.FullLoader) + params_in = tparams_in["params"] + rawfiles = tparams_in["rawfiles"] + fastafile = tparams_in["fastafile"] + + + +## WE NEED AN SDRF FILE FOR THE EXPERIMENTAL DESIGN, CONTAINING FILE LOCATIONS +sdrf_content = pd.DataFrame() +has_sdrf = os.path.isfile("./sdrf.tsv") +if has_sdrf : + sdrf_content = pd.read_csv("sdrf.tsv", sep="\t") + mod_columns = sdrf_content.filter(like="comment[modification parameters]") + sdrf_content = sdrf_content.drop(columns=mod_columns.columns) + # delete columns with fixed/variable modification info + if "fixed_mods" in params_in.keys(): + ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=fixed")) ] + mod_columns.drop(ttt, axis=1, inplace=True) + overwritten.add("fixed_mods") + if "variable_mods" in params_in.keys(): + ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable")) ] + mod_columns.drop(ttt, axis=1, inplace=True) + overwritten.add("variable_mods") + + +else: + ## THROW ERROR FOR MISSING SDRF + exit("ERROR: No SDRF file given. 
Add an at least minimal version\nFor more details, see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") + + +## FIRST STANDARD PARAMETERS +# FOR GIVEN PARAMETERS +# CHECK WHETHER COLUMN IN SDRF TO PUT WARNING AND OVERWRITE +# IF NOT GIVEN, WRITE COLUMN +for p in mapping: + pname = p["name"] + ptype = p["type"] + print("---- Parameter: " + pname + ": ----") + if(pname in list(params_in.keys())) : + print("Found in parameter file") + pvalue = params_in[pname] + else: + print("Setting to default: " + p["default"]) + pvalue = p["default"] + + psdrf = "comment[" + p["sdrf"] + "]" + + + + if (psdrf) in sdrf_content.keys() : + if (len(set(sdrf_content[psdrf])) > 1) : + exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating the file into parts with the same data analysis parameters") + print("WARNING: Overwriting values in sdrf file with " + pvalue) + overwritten.add(pname) + + # for each type: check consistency + #print(type(pvalue)) + if ptype == "boolean" : + if not isinstance(pvalue, bool) : + exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") + if ptype == "str" : + if not isinstance(pvalue, str) : + exit("ERROR: " + pname + " needs to be a string!!") + if ptype == "integer" : + if not isinstance(pvalue, int) : + exit("ERROR: " + pname + " needs to be a string!!") + if ptype == "float" : + if not isinstance(pvalue, (float, int)) : + exit("ERROR: " + pname + " needs to be a numeric value!!") + if ptype == "class" : + not_matching = [x for x in pvalue.split(",") if x not in p["value"]] + if not_matching != [] : + exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") + + + # Mass tolerances: do they include Da or ppm exclusively? + if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance" : + unit = pvalue.split(" ")[1] + if unit != "Da" and unit != "ppm" : + exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the value!!\nWe found " + unit) + + ## ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES + if pname == "enzyme" : + ols_out = olsclient.search(pvalue, ontology="MS", exact=True) + if ols_out == None or len(ols_out) > 1 : + exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") + pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] + + + ## Modifications: look up in Unimod + if pname == "fixed_mods" or pname == "variable_mods" : + mods = pvalue.split(",") + for m in mods : + tmod = m.split(" of ") + if len(tmod) < 2: + exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. Note that it should be single residues") + modname = tmod[0] + modpos = tmod[1] + found = [x for x in unimod.modifications if modname == x.get_name()] + if found == [] : + exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. 
Also check whether you used space between the comma separated modifications") + modtype = "fixed" + if (pname == "variable_mods") : modtype = "variable" + if re.match("[A-Z]",modpos) : + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";TA=" + modpos + elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"] : + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";PP=" + modpos + else : + exit("ERROR: Wrong residue given: " + modpos + ". Should be either one upper case letter or any of \"Protein N-term\", \"Protein C-term\", \"Any N-term\", \"Any C-term\"") + + +#print(found_list[0].get_name()) + + + ## Now finally writing the value + else : + sdrf_content[psdrf] = pvalue + + + + +## OVERWRITE RAW FILES IF GIVEN TO DIRECT TO THE CORRECT LOCATION? + +## ADD FASTA FILE TO SDRF (COMMENT:FASTA DATABASE FILE)? + +## WRITE EXPERIMENTAL DESIGN IF NO SDRF? + + +## adding modification columns +colnames = list(sdrf_content.columns) + ["comment[modification parameters]"] * len(mod_columns.columns) + +sdrf_content = pd.concat([sdrf_content, mod_columns], axis=1) + +print("--- Writing sdrf file into sdrf_local.tsv ---") +sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False) + +## Verify with sdrf-parser +check_sdrf = SdrfDataFrame() +check_sdrf.parse("sdrf_local.tsv") +check_sdrf.validate("mass_spectrometry") + + +print("########## SUMMARY #########") +print("--- The following parameters have been overwritten in the sdrf file: ---") +for p in overwritten : + print(p) diff --git a/setup.py b/setup.py index 61d2418..be52dc5 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ long_description_content_type="text/markdown", long_description=long_description, license="'Apache 2.0", - data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml"])], + data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/maxquant/param2sdrf.yml"])], package_data={'': ['*.xml'], }, url="https://github.com/bigbio/sdrf-pipelines", packages=find_packages(), From ac1478804c6f0c3aa612865d98987d78b02f18ff Mon Sep 17 00:00:00 2001 From: veitveit Date: Thu, 27 Jan 2022 18:29:15 +0100 Subject: [PATCH 2/8] added various data analysis parameters --- ...drf-mapper.py => add_datanalysis_param.py} | 0 sdrf_pipelines/sdrf_merge/params2sdrf.yml | 223 ------------------ 2 files changed, 223 deletions(-) rename sdrf_pipelines/sdrf_merge/{sdrf-mapper.py => add_datanalysis_param.py} (100%) delete mode 100644 sdrf_pipelines/sdrf_merge/params2sdrf.yml diff --git a/sdrf_pipelines/sdrf_merge/sdrf-mapper.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py similarity index 100% rename from sdrf_pipelines/sdrf_merge/sdrf-mapper.py rename to sdrf_pipelines/sdrf_merge/add_datanalysis_param.py diff --git a/sdrf_pipelines/sdrf_merge/params2sdrf.yml b/sdrf_pipelines/sdrf_merge/params2sdrf.yml deleted file mode 100644 index 07e6e97..0000000 --- a/sdrf_pipelines/sdrf_merge/params2sdrf.yml +++ /dev/null @@ -1,223 +0,0 @@ -name: param2sdrf - -description: Mapping of parameters to run workflows with sdrf headers, including default values - -parameters: -- name: fixed_mods - type: ontology - sdrf: modification parameters - default: NT=Carbamidomethyl;TA=C;MT=fixed;AC=UNIMOD:4 - comment: only as unimod names with residue(s) separate via comma - -- name: variable_mods - type: ontology - sdrf: modification parameters - default: 
NT=oxidation;MT=variable;TA=M;AC=UNIMOD:35 - comment: only as unimod names with residue(s) - -- name: precursor_mass_tolerance - type: string - sdrf: precursor mass tolerance - default: 30 ppm - -- name: fragment_mass_tolerance - type: string - sdrf: fragment mass tolerance - default: 0.05 Da - -- name: enzyme - type: ontology - sdrf: cleavage agent details - default: Trypsin - -#- name: type_cleavage -# type: class -# sdrf: type of cleavage -# default: - -- name: fions - type: class - sdrf: forward ions - default: b - value: - - a - - a1 - - b - - c - - c1 - - a1 - -- name: rions - type: class - sdrf: reverse ions - default: y - value: - - x - - y - - z - - z1 - - z2 - -- name: isotope_error_range - type: integer - sdrf: isotope error range - default: 0 - -- name: add_decoys - type: boolean - sdrf: add decoys - default: true - -- name: num_hits - type: integer - sdrf: num peptide hits - default: 1 - -- name: miscleavages - type: integer - sdrf: allowed miscleavages - default: 1 - -- name: min_precursor_charge - type: integer - sdrf: minimum precursor charge - default: 2 - -- name: max_precursor_charge - type: integer - sdrf: maximum precursor charge - default: 3 - -- name: min_peptide_length - type: integer - sdrf: minimum peptide length - default: 8 - -- name: max_peptide_length - type: integer - sdrf: maximum peptide length - default: 12 - -- name: max_mods - type: integer - sdrf: maximum allowed modification - default: 4 - -- name: fdr_psm - type: float - sdrf: fdr on psm level - default: 0.01 - -- name: fdr_peptide - type: float - sdrf: fdr on peptide level - default: 0.01 - -- name: fdr_protein - type: float - sdrf: fdr on protein level - default: 0.01 - -#- name: match_between_runs_rt_tol -#- type: float -#- sdrf: retention time tolerance match between runs -#- default: TODO - -#- name: match_between_runs_mz_tol -#- type: float -#- sdrf: m over z tolerance match between runs -#- default: TODO - -- name: match_between_runs - type: boolean - sdrf: run match between runs - default: true - -- name: protein_inference - type: class - sdrf: protein inference method - default: unique - value: - - unique - - shared - - parsimony - - other - -- name: quantification_method - type: class - sdrf: quantification method - default: intensity - value: - - precursor - - ms2 - - labelled - - other - -- name: summarization_proteins - type: class - sdrf: summarization of proteins method - default: sum_abs - value: - - sum_abs - - mean - - median - - other - comment: sum_abs is the only one on non-transformed data - -- name: min_num_peptides - type: integer - sdrf: minimum number of peptides per protein - default: 2 - -- name: summarization_psms - type: class - sdrf: summarization of psms method - default: sum_abs - value: - - sum_abs - - mean - - median - - other - - -- name: quant_transformation - type: class - sdrf: transformation of quantitative values - default: log - value: - - log - - rlog - - none - - other - -- name: normalization_method - type: class - sdrf: normalization method - default: median - value: - - mean - - median - - quantile - - linear - - non-linear - - other - -- name: run_statistics - type: boolean - sdrf: run statistical tests - default: true - -- name: fdr_method - type: class - sdrf: method for correction of multiple testing - default: benjamini-hochberg - value: - - benjamini-hochberg - - bonferroni - - qvalue - - other - -- name: fdr_threshold - type: float - sdrf: threshold for statistical test fdr - default: 0.01 From fd97ffded714d409620cbffd4c5cee07338223db Mon Sep 17 
00:00:00 2001 From: veitveit Date: Fri, 28 Jan 2022 12:47:50 +0100 Subject: [PATCH 3/8] corrected style --- sdrf_pipelines/maxquant/maxquant.py | 14 ++++---- .../sdrf_merge/add_datanalysis_param.py | 35 +++++++++---------- setup.py | 2 +- 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py index 5852ae4..c23db2a 100644 --- a/sdrf_pipelines/maxquant/maxquant.py +++ b/sdrf_pipelines/maxquant/maxquant.py @@ -649,7 +649,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns sdrf.columns = map(str.lower, sdrf.columns) # convert column names to lower-case with open(self.datparamfile) as file: - param_mapping = yaml.load(file, Loader=yaml.FullLoader) + param_mapping = yaml.safe_load(file) mapping = param_mapping["parameters"] datparams = dict() for i in mapping: @@ -797,7 +797,6 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns else: file2technical_rep[raw] = "1" - # store highest replicate number for this source name if source_name in source_name2n_reps: source_name2n_reps[source_name] = max(int(source_name2n_reps[source_name]), @@ -1301,10 +1300,9 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns warning_message = "multiple values for parameter minimum peptide length, taking the first: " + first self.warnings[warning_message] = self.warnings.get(warning_message, 0) + 1 else: - minPepLen.appendChild(doc.createTextNode('7')) + minPepLen.appendChild(doc.createTextNode('7')) root.appendChild(minPepLen) - psmFdrCrosslink = doc.createElement('psmFdrCrosslink') psmFdrCrosslink.appendChild(doc.createTextNode('0.01')) root.appendChild(psmFdrCrosslink) @@ -1324,7 +1322,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(peptideFdr) proteinFdr = doc.createElement('proteinFdr') - tparam = file2params["fdr_protein"] + tparam = file2params["fdr_protein"] if (len(tparam) > 0): first = list(tparam.values())[0] warning_message = "overwriting protein FDR using the value in the sdrf file" @@ -1338,7 +1336,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(proteinFdr) siteFdr = doc.createElement('siteFdr') - tparam = file2params["fdr_psm"] + tparam = file2params["fdr_psm"] if (len(tparam) > 0): first = list(tparam.values())[0] warning_message = "overwriting PSM FDR using the value in the sdrf file" @@ -1364,7 +1362,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns root.appendChild(useNormRatiosForOccupancy) minPeptides = doc.createElement('minPeptides') - tparam = file2params["min_num_peptides"] + tparam = file2params["min_num_peptides"] if (len(tparam) > 0): first = list(tparam.values())[0] minPeptides.appendChild(doc.createTextNode(first)) @@ -1445,7 +1443,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns quantMode = doc.createElement('quantMode') - tparam = file2params["quantification_method"] + tparam = file2params["quantification_method"] if (len(tparam) > 0): first = list(tparam.values())[0] if first == "unique": diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 6a649a2..078472e 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -1,9 +1,9 @@ import pandas as pd import re import yaml -import sdrf_pipelines 
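+ # Illustrative sketch (not part of the patch): what the overridden nodes in the maxquant.py hunks above end up looking like in the generated mqpar.xml. Element names follow the diff; the 0.05 value is a made-up example standing in for an SDRF-supplied "fdr on protein level".
+ # from xml.dom.minidom import Document
+ # doc = Document()
+ # root = doc.createElement('MaxQuantParams')
+ # doc.appendChild(root)
+ # proteinFdr = doc.createElement('proteinFdr')
+ # proteinFdr.appendChild(doc.createTextNode('0.05'))  # value taken from the SDRF column
+ # root.appendChild(proteinFdr)
+ # print(doc.toprettyxml(indent='  '))  # contains <proteinFdr>0.05</proteinFdr>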
+#import sdrf_pipelines import os.path -from sdrf_pipelines.zooma.zooma import Zooma, OlsClient +from sdrf_pipelines.zooma.zooma import OlsClient from sdrf_pipelines.openms.unimod import UnimodDatabase from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame @@ -20,13 +20,13 @@ overwritten = set() with open(r'param2sdrf.yml') as file: - param_mapping = yaml.load(file, Loader=yaml.FullLoader) + param_mapping = yaml.safe_load(file) mapping = param_mapping["parameters"] ## READ PARAMETERS FOR RUNNING WORKFLOW with open(r'params.yml') as file: - tparams_in = yaml.load(file, Loader=yaml.FullLoader) + tparams_in = yaml.safe_load(file) params_in = tparams_in["params"] rawfiles = tparams_in["rawfiles"] fastafile = tparams_in["fastafile"] @@ -49,13 +49,12 @@ ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable")) ] mod_columns.drop(ttt, axis=1, inplace=True) overwritten.add("variable_mods") - else: ## THROW ERROR FOR MISSING SDRF exit("ERROR: No SDRF file given. Add an at least minimal version\nFor more details, see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") - + ## FIRST STANDARD PARAMETERS # FOR GIVEN PARAMETERS # CHECK WHETHER COLUMN IN SDRF TO PUT WARNING AND OVERWRITE @@ -83,32 +82,32 @@ # for each type: check consistency #print(type(pvalue)) - if ptype == "boolean" : + if ptype == "boolean": if not isinstance(pvalue, bool) : exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") - if ptype == "str" : + if ptype == "str": if not isinstance(pvalue, str) : exit("ERROR: " + pname + " needs to be a string!!") - if ptype == "integer" : + if ptype == "integer": if not isinstance(pvalue, int) : exit("ERROR: " + pname + " needs to be a string!!") - if ptype == "float" : - if not isinstance(pvalue, (float, int)) : + if ptype == "float": + if not isinstance(pvalue, (float, int)): exit("ERROR: " + pname + " needs to be a numeric value!!") - if ptype == "class" : + if ptype == "class": not_matching = [x for x in pvalue.split(",") if x not in p["value"]] if not_matching != [] : exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") # Mass tolerances: do they include Da or ppm exclusively? - if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance" : - unit = pvalue.split(" ")[1] + if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance": + unit = pvalue.split(" ")[1] if unit != "Da" and unit != "ppm" : exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the value!!\nWe found " + unit) ## ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES - if pname == "enzyme" : + if pname == "enzyme": ols_out = olsclient.search(pvalue, ontology="MS", exact=True) if ols_out == None or len(ols_out) > 1 : exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") @@ -128,7 +127,7 @@ if found == [] : exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. 
Also check whether you used space between the comma separated modifications") modtype = "fixed" - if (pname == "variable_mods") : modtype = "variable" + if (pname == "variable_mods") : modtype = "variable" if re.match("[A-Z]",modpos) : mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";TA=" + modpos elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"] : @@ -141,7 +140,7 @@ ## Now finally writing the value - else : + else : sdrf_content[psdrf] = pvalue @@ -160,7 +159,7 @@ sdrf_content = pd.concat([sdrf_content, mod_columns], axis=1) print("--- Writing sdrf file into sdrf_local.tsv ---") -sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False) +sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False) ## Verify with sdrf-parser check_sdrf = SdrfDataFrame() diff --git a/setup.py b/setup.py index be52dc5..b77a31d 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ long_description_content_type="text/markdown", long_description=long_description, license="'Apache 2.0", - data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/maxquant/param2sdrf.yml"])], + data_files=[("", ["LICENSE", "sdrf_pipelines/openms/unimod.xml", "sdrf_pipelines/sdrf_merge/param2sdrf.yml"])], package_data={'': ['*.xml'], }, url="https://github.com/bigbio/sdrf-pipelines", packages=find_packages(), From 275c3d1a74681aa9f00c0937f26e5281409ba26d Mon Sep 17 00:00:00 2001 From: veitveit Date: Fri, 11 Feb 2022 07:50:50 +0100 Subject: [PATCH 4/8] simplified loop --- sdrf_pipelines/maxquant/maxquant.py | 6 +- .../sdrf_merge/add_datanalysis_param.py | 107 +++++++++--------- 2 files changed, 56 insertions(+), 57 deletions(-) diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py index c23db2a..45928d5 100644 --- a/sdrf_pipelines/maxquant/maxquant.py +++ b/sdrf_pipelines/maxquant/maxquant.py @@ -1021,10 +1021,10 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns comment_p = 'comment[' + p + ']' if comment_p in row: file2params[datparams[p]][raw] = row[comment_p] - - - + + + # create maxquant parameters xml file doc = Document() diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 078472e..43b87c4 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -48,8 +48,7 @@ if "variable_mods" in params_in.keys(): ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable")) ] mod_columns.drop(ttt, axis=1, inplace=True) - overwritten.add("variable_mods") - + overwritten.add("variable_mods") else: ## THROW ERROR FOR MISSING SDRF exit("ERROR: No SDRF file given. 
Add an at least minimal version\nFor more details, see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") @@ -65,7 +64,7 @@ print("---- Parameter: " + pname + ": ----") if(pname in list(params_in.keys())) : print("Found in parameter file") - pvalue = params_in[pname] + pvalue = params_in[pname] else: print("Setting to default: " + p["default"]) pvalue = p["default"] @@ -80,60 +79,60 @@ print("WARNING: Overwriting values in sdrf file with " + pvalue) overwritten.add(pname) - # for each type: check consistency - #print(type(pvalue)) - if ptype == "boolean": - if not isinstance(pvalue, bool) : - exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") - if ptype == "str": - if not isinstance(pvalue, str) : - exit("ERROR: " + pname + " needs to be a string!!") - if ptype == "integer": - if not isinstance(pvalue, int) : - exit("ERROR: " + pname + " needs to be a string!!") - if ptype == "float": - if not isinstance(pvalue, (float, int)): - exit("ERROR: " + pname + " needs to be a numeric value!!") - if ptype == "class": - not_matching = [x for x in pvalue.split(",") if x not in p["value"]] - if not_matching != [] : - exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") - - - # Mass tolerances: do they include Da or ppm exclusively? - if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance": - unit = pvalue.split(" ")[1] - if unit != "Da" and unit != "ppm" : - exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the value!!\nWe found " + unit) - - ## ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES - if pname == "enzyme": - ols_out = olsclient.search(pvalue, ontology="MS", exact=True) - if ols_out == None or len(ols_out) > 1 : - exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") - pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] + # for each type: check consistency + #print(type(pvalue)) + if ptype == "boolean": + if not isinstance(pvalue, bool) : + exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") + elif ptype == "str": + if not isinstance(pvalue, str) : + exit("ERROR: " + pname + " needs to be a string!!") + elif ptype == "integer": + if not isinstance(pvalue, int) : + exit("ERROR: " + pname + " needs to be a string!!") + elif ptype == "float": + if not isinstance(pvalue, (float, int)): + exit("ERROR: " + pname + " needs to be a numeric value!!") + elif ptype == "class": + not_matching = [x for x in pvalue.split(",") if x not in p["value"]] + if not_matching != [] : + exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") + + + # Mass tolerances: do they include Da or ppm exclusively? 
+ if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance": + unit = pvalue.split(" ")[1] + if unit != "Da" and unit != "ppm" : + exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the value!!\nWe found " + unit) + + ## ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES + elif pname == "enzyme": + ols_out = olsclient.search(pvalue, ontology="MS", exact=True) + if ols_out == None or len(ols_out) > 1 : + exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") + pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] ## Modifications: look up in Unimod - if pname == "fixed_mods" or pname == "variable_mods" : - mods = pvalue.split(",") - for m in mods : - tmod = m.split(" of ") - if len(tmod) < 2: - exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. Note that it should be single residues") - modname = tmod[0] - modpos = tmod[1] - found = [x for x in unimod.modifications if modname == x.get_name()] - if found == [] : - exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. Also check whether you used space between the comma separated modifications") - modtype = "fixed" - if (pname == "variable_mods") : modtype = "variable" - if re.match("[A-Z]",modpos) : - mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";TA=" + modpos - elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"] : - mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";PP=" + modpos - else : - exit("ERROR: Wrong residue given: " + modpos + ". Should be either one upper case letter or any of \"Protein N-term\", \"Protein C-term\", \"Any N-term\", \"Any C-term\"") + elif pname == "fixed_mods" or pname == "variable_mods" : + mods = pvalue.split(",") + for m in mods : + tmod = m.split(" of ") + if len(tmod) < 2: + exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. Note that it should be single residues") + modname = tmod[0] + modpos = tmod[1] + found = [x for x in unimod.modifications if modname == x.get_name()] + if found == [] : + exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. Also check whether you used space between the comma separated modifications") + modtype = "fixed" + if (pname == "variable_mods") : modtype = "variable" + if re.match("[A-Z]",modpos) : + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";TA=" + modpos + elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"] : + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";PP=" + modpos + else : + exit("ERROR: Wrong residue given: " + modpos + ". 
Should be either one upper case letter or any of \"Protein N-term\", \"Protein C-term\", \"Any N-term\", \"Any C-term\"") #print(found_list[0].get_name()) From 7d5483f9ee5406e95a1f08da279120b86c4246d5 Mon Sep 17 00:00:00 2001 From: veitveit Date: Fri, 11 Feb 2022 07:58:37 +0100 Subject: [PATCH 5/8] simplified loop --- .../sdrf_merge/add_datanalysis_param.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 43b87c4..606105d 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -64,7 +64,7 @@ print("---- Parameter: " + pname + ": ----") if(pname in list(params_in.keys())) : print("Found in parameter file") - pvalue = params_in[pname] + pvalue = params_in[pname] else: print("Setting to default: " + p["default"]) pvalue = p["default"] @@ -74,10 +74,10 @@ if (psdrf) in sdrf_content.keys() : - if (len(set(sdrf_content[psdrf])) > 1) : + if (len(set(sdrf_content[psdrf])) > 1) : exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating the file into parts with the same data analysis parameters") - print("WARNING: Overwriting values in sdrf file with " + pvalue) - overwritten.add(pname) + print("WARNING: Overwriting values in sdrf file with " + pvalue) + overwritten.add(pname) # for each type: check consistency #print(type(pvalue)) @@ -88,15 +88,15 @@ if not isinstance(pvalue, str) : exit("ERROR: " + pname + " needs to be a string!!") elif ptype == "integer": - if not isinstance(pvalue, int) : + if not isinstance(pvalue, int) : exit("ERROR: " + pname + " needs to be a string!!") elif ptype == "float": if not isinstance(pvalue, (float, int)): exit("ERROR: " + pname + " needs to be a numeric value!!") elif ptype == "class": - not_matching = [x for x in pvalue.split(",") if x not in p["value"]] - if not_matching != [] : - exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") + not_matching = [x for x in pvalue.split(",") if x not in p["value"]] + if not_matching != [] : + exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") # Mass tolerances: do they include Da or ppm exclusively? 
@@ -110,7 +110,7 @@ ols_out = olsclient.search(pvalue, ontology="MS", exact=True) if ols_out == None or len(ols_out) > 1 : exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") - pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] + pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] ## Modifications: look up in Unimod From 72eae10c23037feaae4db52d4570b517021975a4 Mon Sep 17 00:00:00 2001 From: veitveit Date: Fri, 11 Feb 2022 08:05:43 +0100 Subject: [PATCH 6/8] simplified loop --- sdrf_pipelines/maxquant/maxquant.py | 4 ++-- sdrf_pipelines/sdrf_merge/add_datanalysis_param.py | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py index 45928d5..28c8791 100644 --- a/sdrf_pipelines/maxquant/maxquant.py +++ b/sdrf_pipelines/maxquant/maxquant.py @@ -1635,8 +1635,8 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns for p in file2params.keys(): if (len(file2params[p]) > 0): datanalysisparams[p] = file2params[p][key1] - - + + if tag == 0 and tmp == []: int_node = doc.createElement('int') int_node.appendChild(doc.createTextNode('0')) diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 606105d..81e0862 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -48,7 +48,7 @@ if "variable_mods" in params_in.keys(): ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable")) ] mod_columns.drop(ttt, axis=1, inplace=True) - overwritten.add("variable_mods") + overwritten.add("variable_mods") else: ## THROW ERROR FOR MISSING SDRF exit("ERROR: No SDRF file given. 
Add an at least minimal version\nFor more details, see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") @@ -71,8 +71,6 @@ psdrf = "comment[" + p["sdrf"] + "]" - - if (psdrf) in sdrf_content.keys() : if (len(set(sdrf_content[psdrf])) > 1) : exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating the file into parts with the same data analysis parameters") @@ -112,8 +110,7 @@ exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] - - ## Modifications: look up in Unimod + ## Modifications: look up in Unimod elif pname == "fixed_mods" or pname == "variable_mods" : mods = pvalue.split(",") for m in mods : From 05e0d5579f6fe13608033cfba7711b9d3c346236 Mon Sep 17 00:00:00 2001 From: veitveit Date: Fri, 11 Feb 2022 08:17:40 +0100 Subject: [PATCH 7/8] simplified loop --- sdrf_pipelines/maxquant/maxquant.py | 8 ++++---- sdrf_pipelines/sdrf_merge/add_datanalysis_param.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sdrf_pipelines/maxquant/maxquant.py b/sdrf_pipelines/maxquant/maxquant.py index 28c8791..a8f4200 100644 --- a/sdrf_pipelines/maxquant/maxquant.py +++ b/sdrf_pipelines/maxquant/maxquant.py @@ -1646,7 +1646,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns if 'Lys8' in file2label[key1] or 'Arg10' in file2label[key1] or 'Arg6' in \ file2label[key1] or 'Lys6' in file2label[key1]: parameterGroup['0']['silac_shape'] = file2silac_shape[key1] - + elif {instr_val: value2} in tmp: int_node = doc.createElement('int') int_text = doc.createTextNode(str(tag)) @@ -1660,7 +1660,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns tmp.append({instr_val: value2}) parameterGroup[str(tag)] = {'instrument': file2instrument[key1], 'label': file2label[key1] , 'mods': file2mods[key1], 'enzyme': file2enzyme[key1], 'pctol': file2pctol[key1], 'fragtol': file2fragtol[key1],'pctolunit': file2pctolunit[key1], 'fragtolunit': file2fragtolunit[key1], 'datparams': datparams} - + if 'Lys8' in file2label[key1] or 'Arg10' in file2label[key1] or 'Arg6' in file2label[key1] \ or 'Lys6' in file2label[key1]: parameterGroup[str(tag)]['silac_shape'] = file2silac_shape[key1] @@ -1901,7 +1901,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns maxNmods.appendChild(doc.createTextNode('5')) - + maxMissedCleavages = doc.createElement('maxMissedCleavages') maxMissedCleavages.appendChild(doc.createTextNode('2')) enzymeMode = doc.createElement('enzymeMode') @@ -2030,7 +2030,7 @@ def maxquant_convert(self, sdrf_file, fastaFilePath, mqconfdir, matchBetweenRuns searchTolInPpm = doc.createElement('searchTolInPpm') searchTolInPpm.appendChild(doc.createTextNode('False')) isotopeMatchTol = doc.createElement('isotopeMatchTol') - + isotopeMatchTol.appendChild(doc.createTextNode('2')) isotopeMatchTolInPpm = doc.createElement('isotopeMatchTolInPpm') isotopeMatchTolInPpm.appendChild(doc.createTextNode('True')) diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 81e0862..588a800 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -138,9 +138,9 @@ ## Now finally writing the value else : 
sdrf_content[psdrf] = pvalue - - + + ## OVERWRITE RAW FILES IF GIVEN TO DIRECT TO THE CORRECT LOCATION? From ad64539f6d85d16035d6b6f462513ad26f475238 Mon Sep 17 00:00:00 2001 From: veitveit Date: Thu, 17 Mar 2022 19:42:04 +0100 Subject: [PATCH 8/8] become flake8 compliant --- .../sdrf_merge/add_datanalysis_param.py | 262 ++++++++++-------- 1 file changed, 145 insertions(+), 117 deletions(-) diff --git a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py index 588a800..f3252f6 100644 --- a/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py +++ b/sdrf_pipelines/sdrf_merge/add_datanalysis_param.py @@ -1,169 +1,197 @@ import pandas as pd import re import yaml -#import sdrf_pipelines import os.path from sdrf_pipelines.zooma.zooma import OlsClient from sdrf_pipelines.openms.unimod import UnimodDatabase from sdrf_pipelines.sdrf.sdrf import SdrfDataFrame -## Accessing ontologies and CVs +# Accessing ontologies and CVs unimod = UnimodDatabase() olsclient = OlsClient() -#print(ols_out) +# print(ols_out) + +field_types = {"boolean": bool, "str": str, "integer": int, "float": (float, int)} + + +# Function for consistency checks +def verify_content(pname, pvalue, ptype): + # for each type: check consistency + # print(type(pvalue)) + if ptype in field_types.keys(): + if not isinstance(pvalue, field_types[ptype]): + exit("ERROR: " + pname + " needs to be " + ptype + "!!") +# if ptype == "boolean": +# if not isinstance(pvalue, bool): +# exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") +# elif ptype == "str": +# if not isinstance(pvalue, str): +# exit("ERROR: " + pname + " needs to be a string!!") +# elif ptype == "integer": +# if not isinstance(pvalue, int): +# exit("ERROR: " + pname + " needs to be a string!!") +# elif ptype == "float": +# if not isinstance(pvalue, (float, int)): +# exit("ERROR: " + pname + " needs to be a numeric value!!") + elif ptype == "class": + not_matching = [x for x in pvalue.split(",") if x not in p["value"]] + if not_matching != []: + exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + + ' '.join(not_matching) + " did not match") + + # Mass tolerances: do they include Da or ppm exclusively? + if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance": + unit = pvalue.split(" ")[1] + if unit != "Da" and unit != "ppm": + exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the \ +value!!\nWe found " + unit) + # ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES + elif pname == "enzyme": + ols_out = olsclient.search(pvalue, ontology="MS", exact=True) + if ols_out is None: + exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see \ +https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 \ +for available terms") + pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] + return pvalue + + +def new_or_default(params_in, pname, p): + if(pname in list(params_in.keys())): + print("Found in parameter file") + pvalue = params_in[pname] + else: + print("Setting to default: " + p["default"]) + pvalue = p["default"] + return(pvalue) + + +# Function to load modifications +def add_ptms(mods, pname, mod_columns): + for m in mods: + tmod = m.split(" of ") + if len(tmod) < 2: + exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. 
\ +Note that it should be single residues") + modname = tmod[0] + modpos = tmod[1] + found = [x for x in unimod.modifications if modname == x.get_name()] + if found == []: + exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. Also check whether you \ +used space between the comma separated modifications") + modtype = pname.replace("_mods", "") + if re.match("[A-Z]", modpos): + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" +\ + modtype + ";TA=" + modpos + elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"]: + mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" +\ + modtype + ";PP=" + modpos + else: + exit("ERROR: Wrong residue given: " + modpos + ". Should be either one upper case letter or any of \"Protein N-term\", \ + \"Protein C-term\", \"Any N-term\", \"Any C-term\"") + return mod_columns + # modifications have the same column name, not working with pandas # therefore separated mod_columns = pd.DataFrame() -## For summary at the end +# For summary at the end overwritten = set() with open(r'param2sdrf.yml') as file: - param_mapping = yaml.safe_load(file) - mapping = param_mapping["parameters"] + param_mapping = yaml.safe_load(file) + mapping = param_mapping["parameters"] -## READ PARAMETERS FOR RUNNING WORKFLOW +# READ PARAMETERS FOR RUNNING WORKFLOW with open(r'params.yml') as file: - tparams_in = yaml.safe_load(file) - params_in = tparams_in["params"] - rawfiles = tparams_in["rawfiles"] - fastafile = tparams_in["fastafile"] - + tparams_in = yaml.safe_load(file) + params_in = tparams_in["params"] + rawfiles = tparams_in["rawfiles"] + fastafile = tparams_in["fastafile"] - -## WE NEED AN SDRF FILE FOR THE EXPERIMENTAL DESIGN, CONTAINING FILE LOCATIONS +# WE NEED AN SDRF FILE FOR THE EXPERIMENTAL DESIGN, CONTAINING FILE LOCATIONS sdrf_content = pd.DataFrame() has_sdrf = os.path.isfile("./sdrf.tsv") -if has_sdrf : - sdrf_content = pd.read_csv("sdrf.tsv", sep="\t") - mod_columns = sdrf_content.filter(like="comment[modification parameters]") - sdrf_content = sdrf_content.drop(columns=mod_columns.columns) - # delete columns with fixed/variable modification info - if "fixed_mods" in params_in.keys(): - ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=fixed")) ] - mod_columns.drop(ttt, axis=1, inplace=True) - overwritten.add("fixed_mods") - if "variable_mods" in params_in.keys(): - ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable")) ] - mod_columns.drop(ttt, axis=1, inplace=True) - overwritten.add("variable_mods") +if has_sdrf: + sdrf_content = pd.read_csv("sdrf.tsv", sep="\t") + mod_columns = sdrf_content.filter(like="comment[modification parameters]") + sdrf_content = sdrf_content.drop(columns=mod_columns.columns) + sdrf_content["comment[modification parameters]"] = None + # delete columns with fixed/variable modification info + if "fixed_mods" in params_in.keys(): + ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=fixed"))] + mod_columns.drop(ttt, axis=1, inplace=True) + overwritten.add("fixed_mods") + if "variable_mods" in params_in.keys(): + ttt = [x for x in mod_columns.columns if any(mod_columns[x].str.contains("MT=variable"))] + mod_columns.drop(ttt, axis=1, inplace=True) + overwritten.add("variable_mods") else: - ## THROW ERROR FOR MISSING SDRF - exit("ERROR: No SDRF file given. 
Add an at least minimal version\nFor more details, see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") + # THROW ERROR FOR MISSING SDRF + exit("ERROR: No SDRF file given. Add an at least minimal version\nFor more details, \ +see https://github.com/bigbio/proteomics-metadata-standard/tree/master/sdrf-proteomics") -## FIRST STANDARD PARAMETERS +# FIRST STANDARD PARAMETERS # FOR GIVEN PARAMETERS # CHECK WHETHER COLUMN IN SDRF TO PUT WARNING AND OVERWRITE # IF NOT GIVEN, WRITE COLUMN for p in mapping: - pname = p["name"] - ptype = p["type"] - print("---- Parameter: " + pname + ": ----") - if(pname in list(params_in.keys())) : - print("Found in parameter file") - pvalue = params_in[pname] - else: - print("Setting to default: " + p["default"]) - pvalue = p["default"] - - psdrf = "comment[" + p["sdrf"] + "]" - - if (psdrf) in sdrf_content.keys() : - if (len(set(sdrf_content[psdrf])) > 1) : - exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating the file into parts with the same data analysis parameters") - print("WARNING: Overwriting values in sdrf file with " + pvalue) - overwritten.add(pname) - - # for each type: check consistency - #print(type(pvalue)) - if ptype == "boolean": - if not isinstance(pvalue, bool) : - exit("ERROR: " + pname + " needs to be either \"true\" or \"false\"!!") - elif ptype == "str": - if not isinstance(pvalue, str) : - exit("ERROR: " + pname + " needs to be a string!!") - elif ptype == "integer": - if not isinstance(pvalue, int) : - exit("ERROR: " + pname + " needs to be a string!!") - elif ptype == "float": - if not isinstance(pvalue, (float, int)): - exit("ERROR: " + pname + " needs to be a numeric value!!") - elif ptype == "class": - not_matching = [x for x in pvalue.split(",") if x not in p["value"]] - if not_matching != [] : - exit("ERROR: " + pname + " needs to have one of these values: " + ' '.join(p["value"]) + "!!\n" + ' '.join(not_matching) + " did not match") - - - # Mass tolerances: do they include Da or ppm exclusively? - if pname == "fragment_mass_tolerance" or pname == "precursor_mass_tolerance": - unit = pvalue.split(" ")[1] - if unit != "Da" and unit != "ppm" : - exit("ERROR: " + pname + " allows only units of \"Da\" and \"ppm\", separated by space from the value!!\nWe found " + unit) - - ## ENZYME AND MODIFICATIONS: LOOK UP ONTOLOGY VALUES - elif pname == "enzyme": - ols_out = olsclient.search(pvalue, ontology="MS", exact=True) - if ols_out == None or len(ols_out) > 1 : - exit("ERROR: enzyme " + pvalue + " not found in the MS ontology, see https://bioportal.bioontology.org/ontologies/MS/?p=classes&conceptid=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FMS_1001045 for available terms") - pvalue = "NT=" + pvalue + ";AC=" + ols_out[0]["short_form"] + pname = p["name"] + ptype = p["type"] + print("---- Parameter: " + pname + ": ----") - ## Modifications: look up in Unimod - elif pname == "fixed_mods" or pname == "variable_mods" : - mods = pvalue.split(",") - for m in mods : - tmod = m.split(" of ") - if len(tmod) < 2: - exit("ERROR: Something wrong with the modification entry " + m + ". It should be PSI_MS_NAME of RESIDUE. Note that it should be single residues") - modname = tmod[0] - modpos = tmod[1] - found = [x for x in unimod.modifications if modname == x.get_name()] - if found == [] : - exit("ERROR: " + m + " not found in Unimod. Check the \"PSI-MS Names\" in unimod.org. 
Also check whether you used space between the comma separated modifications") - modtype = "fixed" - if (pname == "variable_mods") : modtype = "variable" - if re.match("[A-Z]",modpos) : - mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";TA=" + modpos - elif modpos in ["Protein N-term", "Protein C-term", "Any N-term", "Any C-term"] : - mod_columns[len(mod_columns.columns)+1] = "NT=" + modname + ";AC=" + found[0].get_accession() + ";MT=" + modtype + ";PP=" + modpos - else : - exit("ERROR: Wrong residue given: " + modpos + ". Should be either one upper case letter or any of \"Protein N-term\", \"Protein C-term\", \"Any N-term\", \"Any C-term\"") + pvalue = new_or_default(params_in, pname, p) + psdrf = "comment[" + p["sdrf"] + "]" + if psdrf in sdrf_content.keys(): + if (len(set(sdrf_content[psdrf])) > 1): + exit("ERROR: multiple values for parameter " + pname + " in sdrf file\n We recommend separating \ +the file into parts with the same data analysis parameters") -#print(found_list[0].get_name()) + pvalue = verify_content(pname, pvalue, ptype) + # Modifications: look up in Unimod + if pname in ["fixed_mods", "variable_mods"] and pname in overwritten: + mods = pvalue.split(",") + print("WARNING: Overwriting " + pname + " values in sdrf file with " + pvalue) + mod_columns = add_ptms(mods, pname, mod_columns) - ## Now finally writing the value - else : - sdrf_content[psdrf] = pvalue + # Now finally writing the value + elif pname not in ["fixed_mods", "variable_mods"]: + print("WARNING: Overwriting " + pname + " values in sdrf file with " + pvalue) + overwritten.add(pname) + sdrf_content[psdrf] = pvalue + else: + sdrf_content[psdrf] = pvalue +# OVERWRITE RAW FILES IF GIVEN TO DIRECT TO THE CORRECT LOCATION? +# ADD FASTA FILE TO SDRF (COMMENT:FASTA DATABASE FILE)? -## OVERWRITE RAW FILES IF GIVEN TO DIRECT TO THE CORRECT LOCATION? +# WRITE EXPERIMENTAL DESIGN IF NO SDRF? -## ADD FASTA FILE TO SDRF (COMMENT:FASTA DATABASE FILE)? - -## WRITE EXPERIMENTAL DESIGN IF NO SDRF? - - -## adding modification columns -colnames = list(sdrf_content.columns) + ["comment[modification parameters]"] * len(mod_columns.columns) +# adding modification columns +colnames = list(sdrf_content.columns) + ["comment[modification parameters]"] * len(mod_columns.columns) sdrf_content = pd.concat([sdrf_content, mod_columns], axis=1) +sdrf_content.columns = colnames + +sdrf_content.dropna(how='all', axis=1, inplace=True) print("--- Writing sdrf file into sdrf_local.tsv ---") -sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False) +# sdrf_content.to_csv("sdrf_local.tsv", sep="\t", header=colnames, index=False) +sdrf_content.to_csv("sdrf_local.tsv", sep="\t") -## Verify with sdrf-parser +# Verify with sdrf-parser check_sdrf = SdrfDataFrame() check_sdrf.parse("sdrf_local.tsv") check_sdrf.validate("mass_spectrometry") - print("########## SUMMARY #########") print("--- The following parameters have been overwritten in the sdrf file: ---") -for p in overwritten : - print(p) +for p in overwritten: + print(p)
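
--
Illustrative note (not part of any patch above): the final add_datanalysis_param.py iterates over the "parameters" entries of param2sdrf.yml, takes each value from params.yml or falls back to the entry's default (new_or_default), validates it (verify_content), and writes it into a constant "comment[...]" column of the SDRF table. The short self-contained Python sketch below condenses that mapping step with inline data instead of the YAML/TSV files; the "sdrf" keys and default values shown here are hypothetical examples, not the actual entries shipped in sdrf_pipelines/sdrf_merge/param2sdrf.yml.

    # Minimal sketch of the parameter-to-SDRF mapping step (assumed example values).
    import pandas as pd

    # Hypothetical stand-ins for two entries of param2sdrf.yml ("parameters" list).
    mapping = [
        {"name": "precursor_mass_tolerance", "sdrf": "precursor mass tolerance",
         "type": "str", "default": "10 ppm"},
        {"name": "fragment_mass_tolerance", "sdrf": "fragment mass tolerance",
         "type": "str", "default": "0.05 Da"},
    ]
    # Stand-in for the "params" section of params.yml.
    params_in = {"precursor_mass_tolerance": "20 ppm"}

    # Toy SDRF with two raw files; the real input comes from sdrf.tsv.
    sdrf_content = pd.DataFrame({"comment[data file]": ["run1.raw", "run2.raw"]})

    for p in mapping:
        # Value from the parameter file wins over the default, mirroring new_or_default().
        pvalue = params_in.get(p["name"], p["default"])
        # Tolerances must be "<number> <Da|ppm>", the check verify_content() enforces.
        assert pvalue.split(" ")[1] in ("Da", "ppm")
        # One constant-valued comment[...] column per mapped parameter.
        sdrf_content["comment[" + p["sdrf"] + "]"] = pvalue

    print(sdrf_content)

Running the sketch prints a table with one extra comment[...] column per mapped parameter, the same value in every row, which is the shape the downstream MaxQuant converter reads back per raw file.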