Roestlab · singjc · May 31, 2024 · May 31, 2024 · Jun 2, 2024 · Jun 2, 2024
diff --git a/massdash/loaders/TrafoXMLLoader.py b/massdash/loaders/TrafoXMLLoader.py
@@ -0,0 +1,74 @@
+"""
+massdash/loaders/TrafoXMLLoader
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+from typing import List, Dict, Union
+
+# Access
+from .access.TrafoXMLAccess import TrafoXMLAccess
+
+class TrafoXMLLoader:
+    """
+    Loader for one or more TrafoXML files and their optional companion files.
+
+    Every data file is wrapped in a TrafoXMLAccess object, optionally paired with
+    an IRT library file, an m/z debug file and an ion mobility debug file.
+
+    Args:
+        dataFiles (Union[str, List[str]]): Path, or list of paths, to the TrafoXML file(s).
+        libraryFile (Union[str, List[str]], optional): Path(s) to the IRT library file(s).
+            A single library is shared by all data files; otherwise one library per
+            data file is expected. Defaults to None.
+        mzDebugFile (Union[str, List[str]], optional): Path(s) to the m/z debug file(s),
+            one per data file. Defaults to None.
+        imDebugFile (Union[str, List[str]], optional): Path(s) to the ion mobility debug
+            file(s), one per data file. Defaults to None.
+
+    Attributes:
+        dataFiles_str, libraryFile_str, mzDebugFile_str, imDebugFile_str: The inputs as
+            given, except that a single string is wrapped in a one-element list.
+        dataFiles (List[TrafoXMLAccess]): One accessor per data file.
+
+    Raises:
+        ValueError: If a companion file list cannot be paired one-to-one with dataFiles.
+    """
+
+    def __init__(self, 
+                 dataFiles: Union[str, List[str]], 
+                 libraryFile: Union[str, List[str]] = None, 
+                 mzDebugFile: Union[str, List[str]] = None, 
+                 imDebugFile: Union[str, List[str]] = None):
+        ## store the file names
+        self.dataFiles_str = self._as_list(dataFiles)
+        self.libraryFile_str = self._as_list(libraryFile)
+        self.mzDebugFile_str = self._as_list(mzDebugFile)
+        self.imDebugFile_str = self._as_list(imDebugFile)
+
+        # Expand every optional input to exactly one entry per data file *before*
+        # zipping: zip() cannot iterate over None and silently truncates to the
+        # shortest input, which would drop data files without any warning.
+        libraries = self._per_data_file(self.libraryFile_str, "libraryFile", allow_shared=True)
+        mz_files = self._per_data_file(self.mzDebugFile_str, "mzDebugFile")
+        im_files = self._per_data_file(self.imDebugFile_str, "imDebugFile")
+
+        self.dataFiles = [TrafoXMLAccess(f, l, mz_f, im_f)
+                          for f, l, mz_f, im_f in zip(self.dataFiles_str, libraries, mz_files, im_files)]
+
+    @staticmethod
+    def _as_list(value):
+        """Wrap a single path string in a list; return any other value (list, None) unchanged."""
+        return [value] if isinstance(value, str) else value
+
+    def _per_data_file(self, values, label: str, allow_shared: bool = False) -> list:
+        """
+        Return a list holding one entry of `values` per data file.
+
+        Args:
+            values: A list of paths, or None when the optional input was not supplied.
+            label (str): Argument name used in the error message.
+            allow_shared (bool): If True, a single entry is reused for every data file.
+
+        Raises:
+            ValueError: If the number of entries does not match the number of data files.
+        """
+        n_files = len(self.dataFiles_str)
+        if values is None:
+            return [None] * n_files
+        if allow_shared and len(values) == 1:
+            return list(values) * n_files
+        if len(values) != n_files:
+            raise ValueError(f"{label} has {len(values)} entries but {n_files} data file(s) were given")
+        return list(values)
+
+    def __str__(self):
+        return f"TrafoXMLLoader(dataFiles={self.dataFiles_str}, libraryFile={self.libraryFile_str})"
+
+    def __repr__(self):
+        return f"TrafoXMLLoader(dataFiles={self.dataFiles_str}, libraryFile={self.libraryFile_str})"
+
+
+    def plot(self, debug_plot_type = "rt", split: bool = False):
+        """
+        Show a scatter plot of the loaded pairs with bokeh.
+
+        Args:
+            debug_plot_type (str): One of "rt", "mz" or "im"; selects the plotted
+                columns, the title and the axis labels. Defaults to "rt".
+            split (bool): If True, show one figure per data file; otherwise show a
+                single figure built from the concatenated tables. Defaults to False.
+
+        Raises:
+            KeyError: If debug_plot_type is not one of the supported keys.
+        """
+        import pandas as pd
+        from bokeh.plotting import show
+
+        from massdash.plotting.DebugPlotter import DebugPlotter
+
+        # Each entry: [x column, y column, title, x axis label, y axis label]
+        debug_plot_type_map = {
+            "rt": ['experiment_rt', 'library_rt', 'Retention Time Transformation', 'Original RT [s]', 'Delta RT [s]'],
+            "mz": ['mz', 'theo_mz', 'm/z calibration', 'Experiment m/z', 'Theoretical m/z'],
+            "im": ['im', 'theo_im', 'Ion mobility calibration', 'Experiment Ion Mobility', 'Theoretical Ion Mobility']
+        }
+
+        # Resolve the plot type first so a bad key fails before any table is built
+        try:
+            plot_args = debug_plot_type_map[debug_plot_type]
+        except KeyError:
+            raise KeyError(f"Unknown debug_plot_type {debug_plot_type!r}, expected one of {list(debug_plot_type_map)}") from None
+
+        if split:
+            for data_file in self.dataFiles:
+                df = data_file.load_pairs_df()
+                plotter = DebugPlotter()
+                p = plotter.plot(df, *plot_args)
+                show(p)
+        else:
+            df = pd.concat([data_file.load_pairs_df() for data_file in self.dataFiles])
+
+            plotter = DebugPlotter()
+            p = plotter.plot(df, *plot_args)
+            show(p)
diff --git a/massdash/loaders/access/TrafoXMLAccess.py b/massdash/loaders/access/TrafoXMLAccess.py
@@ -0,0 +1,112 @@
+"""
+massdash/loaders/access/TrafoXMLAccess
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+from typing import List, Dict, Tuple
+from os.path import basename
+import xml.etree.ElementTree as ET
+import pandas as pd
+
+
+# Library Access
+from ..SpectralLibraryLoader import SpectralLibraryLoader
+
+class TrafoXMLAccess:
+    """
+    A class for accessing and loading data from a TrafoXML file.
+
+    Args:
+        input_file (str): The path to the TrafoXML file.
+        irt_library (str, optional): The path to the IRT library file. Defaults to None.
+        mzDebugFile (str, optional): The path to a tab-separated m/z debug file. Defaults to None.
+        imDebugFile (str, optional): The path to a tab-separated ion mobility debug file. Defaults to None.
+
+    Attributes:
+        input_file_str (str): The path to the TrafoXML file.
+        tree (ElementTree): The parsed XML tree from the TrafoXML file.
+        root (Element): The root element of the XML tree.
+        irt_library_str (str): The path to the IRT library file.
+        irt_library (SpectralLibraryLoader): The loaded IRT library, or None if no path was given.
+        mzDebugFile_str (str): The path to the m/z debug file.
+        mzDebugFile (pd.DataFrame): The m/z debug table, or None if no path was given.
+        imDebugFile_str (str): The path to the ion mobility debug file.
+        imDebugFile (pd.DataFrame): The ion mobility debug table, or None if no path was given.
+
+    Methods:
+        load_transformation_params: Loads the transformation parameters from the TrafoXML file.
+        load_pairs: Loads the transformation pairs from the TrafoXML file.
+        load_pairs_df: Loads the transformation pairs as a pandas DataFrame.
+    """
+
+    def __init__(self, 
+                 input_file: str, 
+                 irt_library: str = None, 
+                 mzDebugFile: str = None, 
+                 imDebugFile: str = None) -> None:
+        self.input_file_str = input_file
+        self.tree = ET.parse(self.input_file_str)
+        self.root = self.tree.getroot()
+        self.irt_library_str = irt_library
+        self.mzDebugFile_str = mzDebugFile
+        self.imDebugFile_str = imDebugFile
+
+        # Always define the optional attributes so that reading them never raises
+        # AttributeError when the corresponding file was not supplied.
+        self.irt_library = None
+        self.mzDebugFile = None
+        self.imDebugFile = None
+
+        if self.irt_library_str is not None:
+            self.irt_library = SpectralLibraryLoader(self.irt_library_str)
+            self.irt_library.load()
+
+        if self.mzDebugFile_str is not None:
+            self.mzDebugFile = pd.read_csv(self.mzDebugFile_str, sep='\t')
+
+        if self.imDebugFile_str is not None:
+            self.imDebugFile = pd.read_csv(self.imDebugFile_str, sep='\t')
+
+    def _find_transformation(self):
+        """
+        Returns the 'Transformation' child of the root element.
+
+        Raises:
+            ValueError: If the file has no 'Transformation' element.
+
+        """
+        transformation = self.root.find('Transformation')
+        # Compare against None explicitly: an Element without children is falsy.
+        if transformation is None:
+            raise ValueError(f"No 'Transformation' element found in {self.input_file_str}")
+        return transformation
+
+    def load_transformation_params(self) -> Dict:
+        """
+        Loads the transformation parameters from the TrafoXML file.
+
+        Returns:
+            dict: A dictionary mapping each 'Param' name attribute to its value attribute
+                (values are kept as the strings found in the file).
+
+        Raises:
+            ValueError: If the file has no 'Transformation' element.
+
+        """
+        transformation = self._find_transformation()
+        params = {param.attrib['name']: param.attrib['value'] for param in transformation.findall('Param')}
+        return params
+
+    def load_pairs(self) -> List[Tuple[float, float]]:
+        """
+        Loads the transformation pairs from the TrafoXML file.
+
+        Returns:
+            List[Tuple[float, float]]: A list of ('from', 'to') tuples representing the
+                transformation pairs; empty if the file has no 'Pairs' element.
+
+        Raises:
+            ValueError: If the file has no 'Transformation' element.
+
+        """
+        pairs_node = self._find_transformation().find('Pairs')
+        if pairs_node is None:
+            return []
+        pairs = [(float(pair.attrib['from']), float(pair.attrib['to'])) for pair in pairs_node]
+        return pairs
+
+    def load_pairs_df(self) -> pd.DataFrame:
+        """
+        Loads the transformation pairs as a pandas DataFrame.
+
+        The 'from'/'to' values of each pair become the 'experiment_rt' and
+        'library_rt' columns. Library and debug information is inner-joined onto
+        them when the corresponding files were supplied, and a 'filename' column
+        holding the input file's base name up to its first '.' is appended.
+
+        Returns:
+            pd.DataFrame: A DataFrame containing the transformation pairs.
+
+        """
+        pairs = self.load_pairs()
+        # Force a float dtype so the merge keys stay numeric even without any pairs
+        df = pd.DataFrame(pairs, columns=['experiment_rt', 'library_rt'], dtype=float)
+
+        # Add irt precursor information to table if irt_library is available
+        # NOTE(review): the merges below match on exact float equality of the
+        # retention times - confirm the files carry identical values.
+        if self.irt_library_str is not None:
+            irt_prec_meta = self.irt_library.data[['GeneName', 'ProteinId', 'ModifiedPeptideSequence',
+       'PrecursorMz', 'PrecursorCharge', 'ProductMz', 'ProductCharge', 'Annotation', 'NormalizedRetentionTime',
+       'PrecursorIonMobility']].drop_duplicates()
+            df = pd.merge(df, irt_prec_meta, left_on='library_rt', right_on='NormalizedRetentionTime', how='inner')
+
+        if self.mzDebugFile_str is not None:
+            df = pd.merge(df, self.mzDebugFile, left_on='experiment_rt', right_on='RT', how='inner')
+            # Drop RT column, since it's the same as experiment_rt
+            df = df.drop(columns=['RT'])
+
+        if self.imDebugFile_str is not None:
+            df = pd.merge(df, self.imDebugFile[['RT', 'im', 'theo_im', 'intensity']], left_on='experiment_rt', right_on='RT', how='inner')
+            # Drop RT column, since it's the same as experiment_rt
+            df = df.drop(columns=['RT'])
+
+        df['filename'] = basename(self.input_file_str).split('.')[0]
+
+        return df
diff --git a/massdash/plotting/DebugPlotter.py b/massdash/plotting/DebugPlotter.py
@@ -0,0 +1,71 @@
+"""
+massdash/plotting/GenericPlotter
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+"""
+
+from typing import List, Optional, Literal
+
+# Plotting modules
+from bokeh.plotting import figure, show, output_notebook
+from bokeh.models import HoverTool, ColumnDataSource, PrintfTickFormatter, LegendItem, Legend
+from bokeh.palettes import Category20
+
+
+
+class DebugPlotter:
+    """
+    Builds a bokeh scatter plot from a table, with one colored series per file.
+
+    Attributes:
+        fig: The most recently created bokeh figure, or None before plot() is called.
+    """
+
+    def __init__(self):
+        self.fig = None
+
+    def plot(self, df, x_col, y_col, title, x_axis_label, y_axis_label):
+        """
+        Create a scatter plot of y_col against x_col, grouped by the 'filename' column.
+
+        Args:
+            df: Table with at least the x_col, y_col and 'filename' columns. The hover
+                tools additionally reference the ModifiedPeptideSequence, ProteinId,
+                PrecursorMz, PrecursorCharge, NormalizedRetentionTime and
+                PrecursorIonMobility columns.
+            x_col (str): Name of the column plotted on the x axis.
+            y_col (str): Name of the column plotted on the y axis.
+            title (str): Figure title.
+            x_axis_label (str): Label of the x axis, also used in the hover tooltip.
+            y_axis_label (str): Label of the y axis, also used in the hover tooltip.
+
+        Returns:
+            The bokeh figure (also stored in self.fig).
+
+        Raises:
+            ValueError: If df contains more than 20 distinct filenames.
+        """
+        # Create a new plot
+        p = figure(title=title, x_axis_label=x_axis_label, y_axis_label=y_axis_label,
+                tools=['pan', 'wheel_zoom', 'box_zoom', 'reset', 'save'])
+
+        n_files = len(df['filename'].unique())
+        if n_files == 1:
+            colors = ['blue']
+        elif n_files <= 20:
+            # The Category20 palette dict has no entry below 3 colors, so indexing
+            # it with 2 (or 0) files raises KeyError; take the smallest palette
+            # that is large enough and trim it to the number of files.
+            colors = Category20[max(n_files, 3)][:n_files]
+        else:
+            raise ValueError("Too many files to plot (>20), not enought colors available")
+
+        legend_it = []
+        for file_number, (filename, grouped_df) in enumerate(df.groupby('filename'), start=1):
+            color = colors[file_number - 1]
+            print(f"File {file_number}: {filename}")
+            # Add the scatter plot
+            source = ColumnDataSource(grouped_df)
+            renderer = p.scatter(x_col, y_col, source=source, size=10, alpha=0.5, color=color)
+            legend_it.append((f"File {file_number}", [renderer]))
+
+        # Configure the minimal hover tool
+        hover_minimal = HoverTool(tooltips=[
+            (x_axis_label, f'@{x_col}{{0.0}}'),
+            (y_axis_label, f'@{y_col}{{0.0}}'),
+            ('Peptide Sequence', '@ModifiedPeptideSequence')
+        ], name="Minimal Hover")
+        p.add_tools(hover_minimal)
+
+        # Configure the detailed hover tool
+        hover_detailed = HoverTool(tooltips=[
+            ('Protein ID', '@ProteinId'),
+            ('Precursor m/z', '@PrecursorMz{0.4}'),
+            ('Precursor Charge', '@PrecursorCharge'),
+            ('Normalized Retention Time', '@NormalizedRetentionTime{0.2}'),
+            ('Precursor Ion Mobility', '@PrecursorIonMobility{0.6}'),
+            ('Filename', '''<div style="width:200px; word-wrap:break-word;">@filename</div>''')
+        ], name="Detailed Hover")
+        p.add_tools(hover_detailed)
+
+        # Add a legend for the filename
+        legend = Legend(items=legend_it)
+        legend.click_policy = "mute"
+        legend.label_text_font_size = '8pt'
+        p.add_layout(legend, 'right')
+
+        # Format the tick labels to remove scientific notation
+        p.xaxis.formatter = PrintfTickFormatter(format='%.2f')
+        p.yaxis.formatter = PrintfTickFormatter(format='%.2f')
+
+        # Keep a handle on the figure; __init__ declares self.fig for this purpose
+        self.fig = p
+        return p