Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT] Feature/osw debug files #136

Open
wants to merge 7 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions massdash/loaders/TrafoXMLLoader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
"""
massdash/loaders/TrafoXMLLoader
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""

from typing import List, Dict, Union

# Access
from .access.TrafoXMLAccess import TrafoXMLAccess

class TrafoXMLLoader:
    """
    Loads one or more TrafoXML files and plots their calibration pairs.

    Every data file is wrapped in a ``TrafoXMLAccess`` together with its
    (optional) iRT library, m/z debug file and ion mobility debug file.

    Args:
        dataFiles (Union[str, List[str]]): Path, or list of paths, to the TrafoXML file(s).
        libraryFile (Union[str, List[str]], optional): Path, or list of paths, to the library
            file(s). A single library is shared by every data file; otherwise one library
            per data file is expected. Defaults to None.
        mzDebugFile (Union[str, List[str]], optional): Path, or list of paths (one per data
            file), to the m/z debug file(s). Defaults to None.
        imDebugFile (Union[str, List[str]], optional): Path, or list of paths (one per data
            file), to the ion mobility debug file(s). Defaults to None.

    Attributes:
        dataFiles_str: The data file paths as passed in (a single string is wrapped in a list).
        libraryFile_str: The library file paths, or None.
        mzDebugFile_str: The m/z debug file paths, or None.
        imDebugFile_str: The ion mobility debug file paths, or None.
        dataFiles (List[TrafoXMLAccess]): One accessor per data file.

    Raises:
        ValueError: If the number of library/debug files does not match the number of data files.
    """

    def __init__(self,
                 dataFiles: Union[str, List[str]],
                 libraryFile: Union[str, List[str]] = None,
                 mzDebugFile: Union[str, List[str]] = None,
                 imDebugFile: Union[str, List[str]] = None):
        ## store the file names
        self.dataFiles_str = self._as_list(dataFiles)
        self.libraryFile_str = self._as_list(libraryFile)
        self.mzDebugFile_str = self._as_list(mzDebugFile)
        self.imDebugFile_str = self._as_list(imDebugFile)

        # Expand every optional argument to exactly one entry per data file.
        # Zipping the raw attributes would fail on None and would silently
        # drop data files whenever the lists differ in length.
        n_data_files = len(self.dataFiles_str)
        libraries = self._per_data_file(self.libraryFile_str, n_data_files, "libraryFile", allow_shared=True)
        mz_debug = self._per_data_file(self.mzDebugFile_str, n_data_files, "mzDebugFile")
        im_debug = self._per_data_file(self.imDebugFile_str, n_data_files, "imDebugFile")

        self.dataFiles = [TrafoXMLAccess(f, l, mz_f, im_f)
                          for f, l, mz_f, im_f in zip(self.dataFiles_str, libraries, mz_debug, im_debug)]

    @staticmethod
    def _as_list(files):
        """Wrap a single path in a list; lists and None are returned unchanged."""
        return [files] if isinstance(files, str) else files

    @staticmethod
    def _per_data_file(files, n_data_files, arg_name, allow_shared=False):
        """
        Expand an optional list of paths to one entry per data file.

        Args:
            files: List of paths, or None.
            n_data_files (int): Number of data files to match.
            arg_name (str): Name of the constructor argument, used in the error message.
            allow_shared (bool): If True, a single path is reused for every data file.

        Returns:
            list: A list with exactly ``n_data_files`` entries (all None if ``files`` is None).

        Raises:
            ValueError: If the number of paths cannot be matched to the data files.
        """
        if files is None:
            return [None] * n_data_files

        files = list(files)
        if len(files) == n_data_files:
            return files
        if allow_shared and len(files) == 1:
            return files * n_data_files

        hint = " or a single shared file" if allow_shared else ""
        raise ValueError(f"{arg_name} has {len(files)} entries but {n_data_files} data file(s) were given; "
                         f"provide one per data file{hint}")

    def __str__(self):
        return f"TrafoXMLLoader(dataFiles={self.dataFiles_str}, libraryFile={self.libraryFile_str})"

    def __repr__(self):
        return f"TrafoXMLLoader(dataFiles={self.dataFiles_str}, libraryFile={self.libraryFile_str})"

    def plot(self, debug_plot_type = "rt", split: bool = False):
        """
        Show a scatter plot of the loaded pairs.

        Args:
            debug_plot_type (str): One of "rt", "mz" or "im"; selects the plotted columns,
                the title and the axis labels.
            split (bool): If True, show one figure per data file. Otherwise the tables of
                all data files are concatenated and shown in a single figure.

        Raises:
            KeyError: If ``debug_plot_type`` is not a known plot type.
        """
        import pandas as pd
        from bokeh.plotting import show

        from massdash.plotting.DebugPlotter import DebugPlotter

        # plot type -> [x column, y column, title, x axis label, y axis label]
        debug_plot_type_map = {
            "rt": ['experiment_rt', 'library_rt', 'Retention Time Transformation', 'Original RT [s]', 'Delta RT [s]'],
            "mz": ['mz', 'theo_mz', 'm/z calibration', 'Experiment m/z', 'Theoretical m/z'],
            "im": ['im', 'theo_im', 'Ion mobility calibration', 'Experiment Ion Mobility', 'Theoretical Ion Mobility']
        }

        # Resolve the plot type before any file is loaded so a typo fails fast
        try:
            plot_args = debug_plot_type_map[debug_plot_type]
        except KeyError:
            raise KeyError(f"Unknown debug_plot_type {debug_plot_type!r}, "
                           f"expected one of {sorted(debug_plot_type_map)}") from None

        if split:
            for data_file in self.dataFiles:
                df = data_file.load_pairs_df()
                show(DebugPlotter().plot(df, *plot_args))
        else:
            df = pd.concat([data_file.load_pairs_df() for data_file in self.dataFiles])
            show(DebugPlotter().plot(df, *plot_args))
112 changes: 112 additions & 0 deletions massdash/loaders/access/TrafoXMLAccess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""
massdash/loaders/access/TrafoXMLAccess
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""

from typing import List, Dict, Tuple
from os.path import basename
import xml.etree.ElementTree as ET
import pandas as pd


# Library Access
from ..SpectralLibraryLoader import SpectralLibraryLoader

class TrafoXMLAccess:
    """
    A class for accessing and loading data from a TrafoXML file.

    Args:
        input_file (str): The path to the TrafoXML file.
        irt_library (str, optional): The path to the IRT library file. Defaults to None.
        mzDebugFile (str, optional): The path to a tab-separated m/z debug file. Defaults to None.
        imDebugFile (str, optional): The path to a tab-separated ion mobility debug file. Defaults to None.

    Attributes:
        input_file_str (str): The path to the TrafoXML file.
        tree (ElementTree): The parsed XML tree from the TrafoXML file.
        root (Element): The root element of the XML tree.
        irt_library_str (str): The path to the IRT library file.
        irt_library (SpectralLibraryLoader): The loaded IRT library, or None if no path was given.
        mzDebugFile_str (str): The path to the m/z debug file.
        mzDebugFile (pd.DataFrame): The m/z debug table, or None if no path was given.
        imDebugFile_str (str): The path to the ion mobility debug file.
        imDebugFile (pd.DataFrame): The ion mobility debug table, or None if no path was given.

    Methods:
        load_transformation_params: Loads the transformation parameters from the TrafoXML file.
        load_pairs: Loads the transformation pairs from the TrafoXML file.
        load_pairs_df: Loads the transformation pairs as a pandas DataFrame.
    """

    def __init__(self,
                 input_file: str,
                 irt_library: str = None,
                 mzDebugFile: str = None,
                 imDebugFile: str = None) -> None:
        self.input_file_str = input_file
        self.tree = ET.parse(self.input_file_str)
        self.root = self.tree.getroot()
        self.irt_library_str = irt_library
        self.mzDebugFile_str = mzDebugFile
        self.imDebugFile_str = imDebugFile

        # Always define the optional attributes so that reading them never
        # raises AttributeError when the corresponding path was not given.
        self.irt_library = None
        self.mzDebugFile = None
        self.imDebugFile = None

        if self.irt_library_str is not None:
            self.irt_library = SpectralLibraryLoader(self.irt_library_str)
            self.irt_library.load()

        if self.mzDebugFile_str is not None:
            self.mzDebugFile = pd.read_csv(self.mzDebugFile_str, sep='\t')

        if self.imDebugFile_str is not None:
            self.imDebugFile = pd.read_csv(self.imDebugFile_str, sep='\t')

    def _find_transformation(self):
        """Return the <Transformation> element, raising a clear error if the file has none."""
        transformation = self.root.find('Transformation')
        # Compare against None explicitly: an Element without children is falsy
        if transformation is None:
            raise ValueError(f"No <Transformation> element found in {self.input_file_str}")
        return transformation

    def load_transformation_params(self) -> Dict:
        """
        Loads the transformation parameters from the TrafoXML file.

        Returns:
            dict: A dictionary mapping each parameter name to its value (as a string).

        Raises:
            ValueError: If the file has no <Transformation> element.
        """
        transformation = self._find_transformation()
        return {param.attrib['name']: param.attrib['value'] for param in transformation.findall('Param')}

    def load_pairs(self) -> List[Tuple[float, float]]:
        """
        Loads the transformation pairs from the TrafoXML file.

        Returns:
            List[Tuple[float, float]]: A list of (from, to) tuples representing the transformation
                pairs. Empty if the transformation has no <Pairs> element.

        Raises:
            ValueError: If the file has no <Transformation> element.
        """
        pairs_element = self._find_transformation().find('Pairs')
        if pairs_element is None:
            return []
        return [(float(pair.attrib['from']), float(pair.attrib['to'])) for pair in pairs_element]

    def load_pairs_df(self) -> pd.DataFrame:
        """
        Loads the transformation pairs as a pandas DataFrame.

        The table always holds the columns 'experiment_rt', 'library_rt' and 'filename'.
        Library and debug file columns are inner-joined onto it when the corresponding
        file was given, so rows without a match are dropped.

        Returns:
            pd.DataFrame: A DataFrame containing the transformation pairs.

        Raises:
            ValueError: If the file has no <Transformation> element.
        """
        pairs = self.load_pairs()
        # Force a float dtype so the merge keys stay numeric even when there are no pairs
        df = pd.DataFrame(pairs, columns=['experiment_rt', 'library_rt'], dtype=float)

        # Add irt precursor information to table if irt_library is available
        if self.irt_library_str is not None:
            irt_prec_meta = self.irt_library.data[['GeneName', 'ProteinId', 'ModifiedPeptideSequence',
                                                   'PrecursorMz', 'PrecursorCharge', 'ProductMz', 'ProductCharge',
                                                   'Annotation', 'NormalizedRetentionTime',
                                                   'PrecursorIonMobility']].drop_duplicates()
            df = pd.merge(df, irt_prec_meta, left_on='library_rt', right_on='NormalizedRetentionTime', how='inner')

        if self.mzDebugFile_str is not None:
            df = pd.merge(df, self.mzDebugFile, left_on='experiment_rt', right_on='RT', how='inner')
            # Drop RT column, since it's the same as experiment_rt
            df = df.drop(columns=['RT'])

        if self.imDebugFile_str is not None:
            df = pd.merge(df, self.imDebugFile[['RT', 'im', 'theo_im', 'intensity']],
                          left_on='experiment_rt', right_on='RT', how='inner')
            # Drop RT column, since it's the same as experiment_rt
            df = df.drop(columns=['RT'])

        # Label every row with the file name up to its first dot
        df['filename'] = basename(self.input_file_str).split('.')[0]

        return df
71 changes: 71 additions & 0 deletions massdash/plotting/DebugPlotter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""
massdash/plotting/DebugPlotter
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
"""

from typing import List, Optional, Literal

# Plotting modules
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, PrintfTickFormatter, LegendItem, Legend
from bokeh.palettes import Category20



class DebugPlotter:
    """
    Builds a Bokeh scatter plot of two columns of a table, coloured per file.

    Attributes:
        fig: The figure created by the most recent call to ``plot``, or None before the first call.
    """

    def __init__(self):
        self.fig = None

    def plot(self, df, x_col, y_col, title, x_axis_label, y_axis_label):
        """
        Create a scatter plot of ``y_col`` against ``x_col`` with one renderer per file.

        Args:
            df: Table holding ``x_col``, ``y_col`` and a 'filename' column. The hover tools
                additionally reference 'ModifiedPeptideSequence', 'ProteinId', 'PrecursorMz',
                'PrecursorCharge', 'NormalizedRetentionTime' and 'PrecursorIonMobility'.
            x_col (str): Name of the column plotted on the x axis.
            y_col (str): Name of the column plotted on the y axis.
            title (str): Title of the figure.
            x_axis_label (str): Label of the x axis (also used in the hover tooltip).
            y_axis_label (str): Label of the y axis (also used in the hover tooltip).

        Returns:
            The Bokeh figure (also stored in ``self.fig``).

        Raises:
            ValueError: If ``df`` holds more than 20 distinct filenames.
        """
        # Create a new plot
        p = figure(title=title, x_axis_label=x_axis_label, y_axis_label=y_axis_label,
                   tools=['pan', 'wheel_zoom', 'box_zoom', 'reset', 'save'])

        n_files = len(df['filename'].unique())
        if n_files == 1:
            colors = ['blue']
        elif n_files <= 20:
            # Category20 has no palette with fewer than 3 colors, so indexing it with
            # 0 or 2 raises KeyError: take the smallest palette that fits and trim it.
            colors = Category20[max(n_files, 3)][:n_files]
        else:
            raise ValueError("Too many files to plot (>20), not enought colors available")

        legend_it = []
        for file_number, (filename, grouped_df) in enumerate(df.groupby('filename'), start=1):
            color = colors[file_number - 1]
            print(f"File {file_number}: {filename}")
            # Add the scatter plot
            source = ColumnDataSource(grouped_df)
            renderer = p.scatter(x_col, y_col, source=source, size=10, alpha=0.5, color=color)
            legend_it.append((f"File {file_number}", [renderer]))

        # Configure the minimal hover tool
        hover_minimal = HoverTool(tooltips=[
            (x_axis_label, f'@{x_col}{{0.0}}'),
            (y_axis_label, f'@{y_col}{{0.0}}'),
            ('Peptide Sequence', '@ModifiedPeptideSequence')
        ], name="Minimal Hover")
        p.add_tools(hover_minimal)

        # Configure the detailed hover tool
        hover_detailed = HoverTool(tooltips=[
            ('Protein ID', '@ProteinId'),
            ('Precursor m/z', '@PrecursorMz{0.4}'),
            ('Precursor Charge', '@PrecursorCharge'),
            ('Normalized Retention Time', '@NormalizedRetentionTime{0.2}'),
            ('Precursor Ion Mobility', '@PrecursorIonMobility{0.6}'),
            ('Filename', '''<div style="width:200px; word-wrap:break-word;">@filename</div>''')
        ], name="Detailed Hover")
        p.add_tools(hover_detailed)

        # Add a legend for the filename
        legend = Legend(items=legend_it)
        legend.click_policy = "mute"
        legend.label_text_font_size = '8pt'
        p.add_layout(legend, 'right')

        # Format the tick labels to remove scientific notation
        p.xaxis.formatter = PrintfTickFormatter(format='%.2f')
        p.yaxis.formatter = PrintfTickFormatter(format='%.2f')

        # Keep a handle on the figure so the `fig` attribute reflects the last plot
        self.fig = p
        return p
Loading