From 0e5ec5f08cc20a7c43e315de973140fb1af22bae Mon Sep 17 00:00:00 2001 From: singjc Date: Fri, 31 May 2024 11:42:57 -0400 Subject: [PATCH 1/7] add: trafoXML accessor --- massdash/loaders/access/TrafoXMLAccess.py | 86 +++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 massdash/loaders/access/TrafoXMLAccess.py diff --git a/massdash/loaders/access/TrafoXMLAccess.py b/massdash/loaders/access/TrafoXMLAccess.py new file mode 100644 index 0000000..279a589 --- /dev/null +++ b/massdash/loaders/access/TrafoXMLAccess.py @@ -0,0 +1,86 @@ +""" +massdash/loaders/access/TrafoXMLAccess +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +""" + +import xml.etree.ElementTree as ET +import pandas as pd +from typing import List, Tuple + +# Library Access +from ..SpectralLibraryLoader import SpectralLibraryLoader + +class TrafoXMLAccess: + """ + A class for accessing and loading data from a TrafoXML file. + + Args: + input_file (str): The path to the TrafoXML file. + irt_library (str, optional): The path to the IRT library file. Defaults to None. + + Attributes: + input_file_str (str): The path to the TrafoXML file. + tree (ElementTree): The parsed XML tree from the TrafoXML file. + root (Element): The root element of the XML tree. + irt_library_str (str): The path to the IRT library file. + irt_library (SpectralLibraryLoader): The loaded IRT library. + + Methods: + load_transformation_params: Loads the transformation parameters from the TrafoXML file. + load_pairs: Loads the transformation pairs from the TrafoXML file. + load_pairs_df: Loads the transformation pairs as a pandas DataFrame. 
+ """ + + def __init__(self, input_file: str, irt_library: str = None) -> None: + self.input_file_str = input_file + self.tree = ET.parse(self.input_file_str) + self.root = self.tree.getroot() + self.irt_library_str = irt_library + + if self.irt_library_str is not None: + self.irt_library = SpectralLibraryLoader(self.irt_library_str) + self.irt_library.load() + + def load_transformation_params(self) -> dict: + """ + Loads the transformation parameters from the TrafoXML file. + + Returns: + dict: A dictionary containing the transformation parameters. + + """ + transformation = self.root.find('Transformation') + params = {param.attrib['name']: param.attrib['value'] for param in transformation.findall('Param')} + return params + + def load_pairs(self) -> List[Tuple[float, float]]: + """ + Loads the transformation pairs from the TrafoXML file. + + Returns: + List[Tuple[float, float]]: A list of tuples representing the transformation pairs. + + """ + transformation = self.root.find('Transformation') + pairs = [(float(pair.attrib['from']), float(pair.attrib['to'])) for pair in transformation.find('Pairs')] + return pairs + + def load_pairs_df(self) -> pd.DataFrame: + """ + Loads the transformation pairs as a pandas DataFrame. + + Returns: + pd.DataFrame: A DataFrame containing the transformation pairs. 
+ + """ + pairs = self.load_pairs() + df = pd.DataFrame(pairs, columns=['experiment_rt', 'library_rt']) + + # Add irt precursor information to table if irt_library is available + if self.irt_library_str is not None: + irt_prec_meta = self.irt_library.data[['GeneName', 'ProteinId', 'ModifiedPeptideSequence', + 'PrecursorMz', 'PrecursorCharge', 'NormalizedRetentionTime', + 'PrecursorIonMobility']].drop_duplicates() + df = pd.merge(df, irt_prec_meta, left_on='library_rt', right_on='NormalizedRetentionTime', how='inner') + + return df From 90b587328e79de649f8d05587eee2a69b2c693ec Mon Sep 17 00:00:00 2001 From: singjc Date: Fri, 31 May 2024 11:45:27 -0400 Subject: [PATCH 2/7] change: dict type set to Dict --- massdash/loaders/access/TrafoXMLAccess.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/massdash/loaders/access/TrafoXMLAccess.py b/massdash/loaders/access/TrafoXMLAccess.py index 279a589..67dd0d3 100644 --- a/massdash/loaders/access/TrafoXMLAccess.py +++ b/massdash/loaders/access/TrafoXMLAccess.py @@ -3,9 +3,10 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ """ +from typing import List, Dict, Tuple import xml.etree.ElementTree as ET import pandas as pd -from typing import List, Tuple + # Library Access from ..SpectralLibraryLoader import SpectralLibraryLoader @@ -41,7 +42,7 @@ def __init__(self, input_file: str, irt_library: str = None) -> None: self.irt_library = SpectralLibraryLoader(self.irt_library_str) self.irt_library.load() - def load_transformation_params(self) -> dict: + def load_transformation_params(self) -> Dict: """ Loads the transformation parameters from the TrafoXML file. 
From f4aa597f91a71af13061a5ac73319f992f226389 Mon Sep 17 00:00:00 2001 From: singjc Date: Sat, 1 Jun 2024 23:39:56 -0400 Subject: [PATCH 3/7] add: loader and plotter for trafoXML --- massdash/loaders/TrafoXMLLoader.py | 36 ++++++++++++ massdash/loaders/access/TrafoXMLAccess.py | 2 + massdash/plotting/DebugPlotter.py | 67 +++++++++++++++++++++++ 3 files changed, 105 insertions(+) create mode 100644 massdash/loaders/TrafoXMLLoader.py create mode 100644 massdash/plotting/DebugPlotter.py diff --git a/massdash/loaders/TrafoXMLLoader.py b/massdash/loaders/TrafoXMLLoader.py new file mode 100644 index 0000000..622af81 --- /dev/null +++ b/massdash/loaders/TrafoXMLLoader.py @@ -0,0 +1,36 @@ +""" +massdash/loaders/TrafoXMLLoader +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +""" + +from typing import List, Dict, Union + +# Access +from .access.TrafoXMLAccess import TrafoXMLAccess + +class TrafoXMLLoader: + def __init__(self, dataFiles: Union[str, List[str]], libraryFile: Union[str, List[str]] = None): + ## store the file names + if isinstance(dataFiles, str): + self.dataFiles_str = [dataFiles] + else: + self.dataFiles_str = dataFiles + + if isinstance(libraryFile, str): + self.libraryFile_str = [libraryFile] + else: + self.libraryFile_str = libraryFile + print(f"len libraryFile: {len(self.libraryFile_str)}") + + if self.libraryFile_str is not None and len(self.libraryFile_str) > 1: + self.dataFiles = [TrafoXMLAccess(f, l) for f, l in zip(self.dataFiles_str, self.libraryFile_str)] + elif self.libraryFile_str is not None and len(self.libraryFile_str) == 1: + self.dataFiles = [TrafoXMLAccess(f, self.libraryFile_str[0]) for f in self.dataFiles_str] + + def __str__(self): + return f"TrafoXMLLoader(dataFiles={self.dataFiles_str}, libraryFile={self.libraryFile_str}" + + def __repr__(self): + return f"TrafoXMLLoader(dataFiles={self.dataFiles_str}, libraryFile={self.libraryFile_str}" + + \ No newline at end of file diff --git a/massdash/loaders/access/TrafoXMLAccess.py 
b/massdash/loaders/access/TrafoXMLAccess.py index 67dd0d3..d93555b 100644 --- a/massdash/loaders/access/TrafoXMLAccess.py +++ b/massdash/loaders/access/TrafoXMLAccess.py @@ -4,6 +4,7 @@ """ from typing import List, Dict, Tuple +from os.path import basename import xml.etree.ElementTree as ET import pandas as pd @@ -83,5 +84,6 @@ def load_pairs_df(self) -> pd.DataFrame: 'PrecursorMz', 'PrecursorCharge', 'NormalizedRetentionTime', 'PrecursorIonMobility']].drop_duplicates() df = pd.merge(df, irt_prec_meta, left_on='library_rt', right_on='NormalizedRetentionTime', how='inner') + df['filename'] = basename(self.input_file_str).split('.')[0] return df diff --git a/massdash/plotting/DebugPlotter.py b/massdash/plotting/DebugPlotter.py new file mode 100644 index 0000000..44a7adb --- /dev/null +++ b/massdash/plotting/DebugPlotter.py @@ -0,0 +1,67 @@ +""" +massdash/plotting/DebugPlotter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +""" + +from typing import List, Optional, Literal + +# Plotting modules +from bokeh.plotting import figure, show, output_notebook +from bokeh.models import HoverTool, ColumnDataSource, PrintfTickFormatter, LegendItem, Legend +from bokeh.palettes import Category20 + + + +class DebugPlotter: + def __init__(self): + self.fig = None + + def plot(self, df): + # Create a new plot + p = figure(title='Retention time transformation', x_axis_label='original RT [s]', y_axis_label='Delta RT [s]', + tools=['pan', 'wheel_zoom', 'box_zoom', 'reset', 'save']) + + unique_filenames = df['filename'].unique() + colors = Category20[len(unique_filenames)] + + legend_it = [] + file_number = 1 + for filename, grouped_df in df.groupby('filename'): + + color = colors[file_number-1] + print(f"File {file_number}: {filename} | color: {color}") + # Add the scatter plot + source = ColumnDataSource(grouped_df) + renderer = p.scatter('experiment_rt', 'library_rt', source=source, size=10, alpha=0.5, color=color) + legend_it.append((f"File {file_number}", [renderer])) + file_number += 1 + + # 
Configure the minimal hover tool + hover_minimal = HoverTool(tooltips=[ + ('original RT', '@experiment_rt{0.0}'), + ('Delta RT', '@library_rt{0.0}'), + ('Peptide Sequence', '@ModifiedPeptideSequence') + ], name="Minimal Hover") + p.add_tools(hover_minimal) + + # Configure the detailed hover tool + hover_detailed = HoverTool(tooltips=[ + ('Protein ID', '@ProteinId'), + ('Precursor m/z', '@PrecursorMz{0.4}'), + ('Precursor Charge', '@PrecursorCharge'), + ('Normalized Retention Time', '@NormalizedRetentionTime{0.2}'), + ('Precursor Ion Mobility', '@PrecursorIonMobility{0.6}'), + ('Filename', '''
@filename
''') + ], name="Detailed Hover") + p.add_tools(hover_detailed) + + # Add a legend for the filename + legend = Legend(items=legend_it) + legend.click_policy="mute" + legend.label_text_font_size = '8pt' + p.add_layout(legend, 'right') + + # Format the tick labels to remove scientific notation + p.xaxis.formatter = PrintfTickFormatter(format='%.2f') + p.yaxis.formatter = PrintfTickFormatter(format='%.2f') + return p \ No newline at end of file From b38a726f718c3bed5c0ff27ff2df4ba65b02bddd Mon Sep 17 00:00:00 2001 From: singjc Date: Sun, 2 Jun 2024 01:47:40 -0400 Subject: [PATCH 4/7] add: mz and im debug files --- massdash/loaders/TrafoXMLLoader.py | 21 ++++++++++++++---- massdash/loaders/access/TrafoXMLAccess.py | 27 +++++++++++++++++++++-- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/massdash/loaders/TrafoXMLLoader.py b/massdash/loaders/TrafoXMLLoader.py index 622af81..4d01835 100644 --- a/massdash/loaders/TrafoXMLLoader.py +++ b/massdash/loaders/TrafoXMLLoader.py @@ -9,7 +9,11 @@ from .access.TrafoXMLAccess import TrafoXMLAccess class TrafoXMLLoader: - def __init__(self, dataFiles: Union[str, List[str]], libraryFile: Union[str, List[str]] = None): + def __init__(self, + dataFiles: Union[str, List[str]], + libraryFile: Union[str, List[str]] = None, + mzDebugFile: Union[str, List[str]] = None, + imDebugFile: Union[str, List[str]] = None): ## store the file names if isinstance(dataFiles, str): self.dataFiles_str = [dataFiles] @@ -20,12 +24,21 @@ def __init__(self, dataFiles: Union[str, List[str]], libraryFile: Union[str, Lis self.libraryFile_str = [libraryFile] else: self.libraryFile_str = libraryFile - print(f"len libraryFile: {len(self.libraryFile_str)}") + + if isinstance(mzDebugFile, str): + self.mzDebugFile_str = [mzDebugFile] + else: + self.mzDebugFile_str = mzDebugFile + + if isinstance(imDebugFile, str): + self.imDebugFile_str = [imDebugFile] + else: + self.imDebugFile_str = imDebugFile if self.libraryFile_str is not None and 
len(self.libraryFile_str) > 1: - self.dataFiles = [TrafoXMLAccess(f, l) for f, l in zip(self.dataFiles_str, self.libraryFile_str)] + self.dataFiles = [TrafoXMLAccess(f, l, mz_f, im_f) for f, l, mz_f, im_f in zip(self.dataFiles_str, self.libraryFile_str, self.mzDebugFile_str, self.imDebugFile_str)] elif self.libraryFile_str is not None and len(self.libraryFile_str) == 1: - self.dataFiles = [TrafoXMLAccess(f, self.libraryFile_str[0]) for f in self.dataFiles_str] + self.dataFiles = [TrafoXMLAccess(f, self.libraryFile_str[0], mz_f, im_f) for f, mz_f, im_f in zip(self.dataFiles_str, self.mzDebugFile_str, self.imDebugFile_str)] def __str__(self): return f"TrafoXMLLoader(dataFiles={self.dataFiles_str}, libraryFile={self.libraryFile_str}" diff --git a/massdash/loaders/access/TrafoXMLAccess.py b/massdash/loaders/access/TrafoXMLAccess.py index d93555b..60d1fe1 100644 --- a/massdash/loaders/access/TrafoXMLAccess.py +++ b/massdash/loaders/access/TrafoXMLAccess.py @@ -33,15 +33,27 @@ class TrafoXMLAccess: load_pairs_df: Loads the transformation pairs as a pandas DataFrame. 
""" - def __init__(self, input_file: str, irt_library: str = None) -> None: + def __init__(self, + input_file: str, + irt_library: str = None, + mzDebugFile: str = None, + imDebugFile: str = None) -> None: self.input_file_str = input_file self.tree = ET.parse(self.input_file_str) self.root = self.tree.getroot() self.irt_library_str = irt_library + self.mzDebugFile_str = mzDebugFile + self.imDebugFile_str = imDebugFile if self.irt_library_str is not None: self.irt_library = SpectralLibraryLoader(self.irt_library_str) self.irt_library.load() + + if self.mzDebugFile_str is not None: + self.mzDebugFile = pd.read_csv(self.mzDebugFile_str, sep='\t') + + if self.imDebugFile_str is not None: + self.imDebugFile = pd.read_csv(self.imDebugFile_str, sep='\t') def load_transformation_params(self) -> Dict: """ @@ -81,9 +93,20 @@ def load_pairs_df(self) -> pd.DataFrame: # Add irt precursor information to table if irt_library is available if self.irt_library_str is not None: irt_prec_meta = self.irt_library.data[['GeneName', 'ProteinId', 'ModifiedPeptideSequence', - 'PrecursorMz', 'PrecursorCharge', 'NormalizedRetentionTime', + 'PrecursorMz', 'PrecursorCharge', 'ProductMz', 'ProductCharge', 'Annotation', 'NormalizedRetentionTime', 'PrecursorIonMobility']].drop_duplicates() df = pd.merge(df, irt_prec_meta, left_on='library_rt', right_on='NormalizedRetentionTime', how='inner') + + if self.mzDebugFile_str is not None: + df = pd.merge(df, self.mzDebugFile, left_on='experiment_rt', right_on='RT', how='inner') + # Drop RT column, since it's the same as experiment_rt + df = df.drop(columns=['RT']) + + if self.imDebugFile_str is not None: + df = pd.merge(df, self.imDebugFile[['RT', 'im', 'theo_im', 'intensity']], left_on='experiment_rt', right_on='RT', how='inner') + # Drop RT column, since it's the same as experiment_rt + df = df.drop(columns=['RT']) + df['filename'] = basename(self.input_file_str).split('.')[0] return df From bb9715e4c94f42899df59acf91c308a62952f750 Mon Sep 17 00:00:00 
2001 From: singjc Date: Sun, 2 Jun 2024 01:48:10 -0400 Subject: [PATCH 5/7] refactor: plotting for generalized scatter plot based on df --- massdash/plotting/DebugPlotter.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/massdash/plotting/DebugPlotter.py b/massdash/plotting/DebugPlotter.py index 44a7adb..12d81c0 100644 --- a/massdash/plotting/DebugPlotter.py +++ b/massdash/plotting/DebugPlotter.py @@ -16,10 +16,10 @@ class DebugPlotter: def __init__(self): self.fig = None - def plot(self, df): + def plot(self, df, x_col, y_col, title, x_axis_label, y_axis_label): # Create a new plot - p = figure(title='Retention time transformation', x_axis_label='original RT [s]', y_axis_label='Delta RT [s]', - tools=['pan', 'wheel_zoom', 'box_zoom', 'reset', 'save']) + p = figure(title=title, x_axis_label=x_axis_label, y_axis_label=y_axis_label, + tools=['pan', 'wheel_zoom', 'box_zoom', 'reset', 'save']) unique_filenames = df['filename'].unique() colors = Category20[len(unique_filenames)] @@ -27,19 +27,18 @@ def plot(self, df): legend_it = [] file_number = 1 for filename, grouped_df in df.groupby('filename'): - - color = colors[file_number-1] - print(f"File {file_number}: {filename} | color: {color}") + color = colors[file_number - 1] + print(f"File {file_number}: {filename}") # Add the scatter plot source = ColumnDataSource(grouped_df) - renderer = p.scatter('experiment_rt', 'library_rt', source=source, size=10, alpha=0.5, color=color) + renderer = p.scatter(x_col, y_col, source=source, size=10, alpha=0.5, color=color) legend_it.append((f"File {file_number}", [renderer])) file_number += 1 # Configure the minimal hover tool hover_minimal = HoverTool(tooltips=[ - ('original RT', '@experiment_rt{0.0}'), - ('Delta RT', '@library_rt{0.0}'), + (x_axis_label, f'@{x_col}{{0.0}}'), + (y_axis_label, f'@{y_col}{{0.0}}'), ('Peptide Sequence', '@ModifiedPeptideSequence') ], name="Minimal Hover") p.add_tools(hover_minimal) @@ -57,7 +56,7 @@ def 
plot(self, df): # Add a legend for the filename legend = Legend(items=legend_it) - legend.click_policy="mute" + legend.click_policy = "mute" legend.label_text_font_size = '8pt' p.add_layout(legend, 'right') From fedabf7761ff922ee33653f35d576715708f8971 Mon Sep 17 00:00:00 2001 From: singjc Date: Sun, 2 Jun 2024 01:59:56 -0400 Subject: [PATCH 6/7] add: plot method to loader --- massdash/loaders/TrafoXMLLoader.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/massdash/loaders/TrafoXMLLoader.py b/massdash/loaders/TrafoXMLLoader.py index 4d01835..9326095 100644 --- a/massdash/loaders/TrafoXMLLoader.py +++ b/massdash/loaders/TrafoXMLLoader.py @@ -46,4 +46,29 @@ def __str__(self): def __repr__(self): return f"TrafoXMLLoader(dataFiles={self.dataFiles_str}, libraryFile={self.libraryFile_str}" - \ No newline at end of file + + def plot(self, debug_plot_type = "rt", split: bool = False): + import pandas as pd + from bokeh.plotting import show + + from massdash.plotting.DebugPlotter import DebugPlotter + + debug_plot_type_map = { + "rt": ['experiment_rt', 'library_rt', 'Retention Time Transformation', 'Original RT [s]', 'Delta RT [s]'], + "mz": ['mz', 'theo_mz', 'm/z calibration', 'Experiment m/z', 'Theoretical m/z'], + "im": ['im', 'theo_im', 'Ion mobility calibration', 'Experiment Ion Mobility', 'Theoretical Ion Mobility'] + } + + if split: + for i in range(len(self.dataFiles)): + df = self.dataFiles[i].load_pairs_df() + plotter = DebugPlotter() + p = plotter.plot(df, *debug_plot_type_map[debug_plot_type]) + show(p) + else: + df = [self.dataFiles[i].load_pairs_df() for i in range(len(self.dataFiles))] + df = pd.concat(df) + + plotter = DebugPlotter() + p = plotter.plot(df, *debug_plot_type_map[debug_plot_type]) + show(p) \ No newline at end of file From 0b06fe9a094eb86deaaca8d09c095a90060c1ecc Mon Sep 17 00:00:00 2001 From: singjc Date: Sun, 2 Jun 2024 02:00:21 -0400 Subject: [PATCH 7/7] fix: if only one file, use default blue 
color --- massdash/plotting/DebugPlotter.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/massdash/plotting/DebugPlotter.py b/massdash/plotting/DebugPlotter.py index 12d81c0..f6eb6f7 100644 --- a/massdash/plotting/DebugPlotter.py +++ b/massdash/plotting/DebugPlotter.py @@ -22,7 +22,12 @@ def plot(self, df, x_col, y_col, title, x_axis_label, y_axis_label): tools=['pan', 'wheel_zoom', 'box_zoom', 'reset', 'save']) unique_filenames = df['filename'].unique() - colors = Category20[len(unique_filenames)] + if len(unique_filenames) == 1: + colors = ['blue'] + elif len(unique_filenames) <= 20: + colors = Category20[len(unique_filenames)] + else: + raise ValueError("Too many files to plot (>20), not enough colors available") legend_it = [] file_number = 1