Skip to content

Commit

Permalink
Merge pull request #233 from theGreatHerrLebert/david@simulation
Browse files Browse the repository at this point in the history
David@simulation
  • Loading branch information
theGreatHerrLebert authored Jul 17, 2024
2 parents 8954eb4 + 5098bd8 commit 0b8cad2
Show file tree
Hide file tree
Showing 9 changed files with 163 additions and 25 deletions.
121 changes: 121 additions & 0 deletions imspy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# imspy - Python package for working with timsTOF raw data



## Raw data access

### Establish a connection to a timsTOF raw file and access data

```python
import numpy as np
from imspy.timstof import TimsDataset

# you can use in-memory mode for faster access, but it requires more memory
tdf = TimsDataset("path/to/rawfolder.d", in_memory=False)

# show global meta data table
print(tdf.global_meta_data)

# show frame meta data
print(tdf.meta_data)

# get the first frame (Bruker frame indices start at 1)
frame = tdf.get_tims_frame(1)

# you can also use indexing
frame = tdf[1]

# print data as pandas dataframe
frame.df()

# get all spectra in a tims frame (sorted by scan = ion mobility)
spectra = frame.to_tims_spectra()

# get a slice of multiple frames
frames = tdf.get_tims_slice(np.array([1, 2, 3]))

# or, by using slicing
frames = tdf[1:4]
```

### DDA data

```python
from imspy.timstof import TimsDatasetDDA
# read a DDA dataset
tdf = TimsDatasetDDA("path/to/rawfolder.d", in_memory=False)

# get raw data of precursors together with their fragment ions
dda_fragments = tdf.get_pasef_fragments()

# the timsTOF re-fragments precursors below a certain intensity threshold,
# you can aggregate the data for increased sensitivity like so:
dda_fragments_grouped = dda_fragments.groupby('precursor_id').agg({
'frame_id': 'first',
'time': 'first',
'precursor_id': 'first',
# this will sum up the raw data of all fragments with the same precursor_id
'raw_data': 'sum',
'scan_begin': 'first',
'scan_end': 'first',
'isolation_mz': 'first',
'isolation_width': 'first',
'collision_energy': 'first',
'largest_peak_mz': 'first',
'average_mz': 'first',
'monoisotopic_mz': 'first',
'charge': 'first',
'average_scan': 'first',
'intensity': 'first',
'parent_id': 'first',
})

# for convenience, you can calculate the inverse mobility
# of the precursor ion by finding the maximum intensity along the scan dimension
mobility = dda_fragments_grouped.apply(
lambda r: r.raw_data.get_inverse_mobility_along_scan_marginal(), axis=1
)

# add the inverse mobility to the grouped data as a new column
dda_fragments_grouped['mobility'] = mobility
```

### DIA data

```python
from imspy.timstof import TimsDatasetDIA
# read a DIA dataset
tdf = TimsDatasetDIA("path/to/rawfolder.d", in_memory=False)
```

## The chemistry module

### Basic usage
```python
```

### Working with peptide sequences
```python
```

## Algorithms and machine learning

### ion mobility and retention time prediction
```python
```

### Locality sensitive hashing
```python
```

### Mixture models
```python
```

## Pipeline: DDA data analysis (imspy_dda)
```python
```

## Pipeline: Synthetic raw data generation (timsim)
```python
```
6 changes: 3 additions & 3 deletions imspy/imspy/simulation/resources/configs/dilution_factors.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
proteome,dilution_factor
"HUMAN.fasta",1.0
"YEAST.fasta",0.33
"ECOLI.fasta",0.11
"HUMAN.fasta",0.65
"YEAST.fasta",0.30
"ECOLI.fasta",0.05
2 changes: 1 addition & 1 deletion imspy/imspy/simulation/timsim/jobs/build_acquisition.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from imspy.simulation.acquisition import TimsTofAcquisitionBuilderDIA
from imspy.simulation.utility import read_acquisition_config
from imspy.timstof import TimsDataset, TimsDatasetDIA
from imspy.timstof import TimsDatasetDIA


def build_acquisition(
Expand Down
7 changes: 6 additions & 1 deletion imspy/imspy/simulation/timsim/jobs/simulate_charge_states.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def simulate_charge_states(
mz_upper: float,
p_charge: float = 0.5,
charge_state_one_probability: float = 0.0,
min_charge_contrib: float = 0.15,
) -> pd.DataFrame:

IonSource = DeepChargeStateDistribution(
Expand All @@ -20,7 +21,11 @@ def simulate_charge_states(
)

# IonSource = BinomialChargeStateDistributionModel(charged_probability=p_charge)
peptide_ions = IonSource.simulate_charge_state_distribution_pandas(peptides, charge_state_one_probability=charge_state_one_probability)
peptide_ions = IonSource.simulate_charge_state_distribution_pandas(
peptides,
charge_state_one_probability=charge_state_one_probability,
min_charge_contrib=min_charge_contrib,
)

# merge tables to have sequences with ions, remove mz values outside scope
ions = pd.merge(left=peptide_ions, right=peptides, left_on=['peptide_id'], right_on=['peptide_id'])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd

from imspy.algorithm import DeepChromatographyApex, load_tokenizer_from_resources, load_deep_retention_time_predictor
from imspy.algorithm.rt.predictors import DeepChromatographyApex, load_deep_retention_time_predictor
from imspy.algorithm.utility import load_tokenizer_from_resources


def simulate_retention_times(
Expand Down
37 changes: 24 additions & 13 deletions imspy/imspy/simulation/timsim/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,19 @@ def main():
help="Use the layout of the reference dataset for the acquisition (default: True)")
parser.set_defaults(use_reference_layout=True)

parser.add_argument("--no_peptide_sampling", dest="sample_peptides", action="store_false",
help="Sample peptides from the digested fasta (default: True)")
parser.set_defaults(sample_peptides=True)

# Peptide digestion arguments
parser.add_argument(
"--sample_fraction",
type=float,
default=0.005,
"--num_sample_peptides",
type=int,
default=25_000,
    help="Number of peptides to be sampled at random from the digested fasta (default: 25000)")

parser.add_argument("--missed_cleavages", type=int, default=2, help="Number of missed cleavages (default: 2)")
parser.add_argument("--min_len", type=int, default=9, help="Minimum peptide length (default: 7)")
parser.add_argument("--min_len", type=int, default=7, help="Minimum peptide length (default: 7)")
parser.add_argument("--max_len", type=int, default=30, help="Maximum peptide length (default: 30)")
parser.add_argument("--cleave_at", type=str, default='KR', help="Cleave at (default: KR)")
parser.add_argument("--restrict", type=str, default='P', help="Restrict (default: P)")
Expand Down Expand Up @@ -134,11 +138,13 @@ def main():
help="Sampling step size for frame distributions (default: 0.001)")

# Number of cores to use
parser.add_argument("--num_threads", type=int, default=16, help="Number of threads to use (default: 16)")
parser.add_argument("--num_threads", type=int, default=-1, help="Number of threads to use (default: -1, all available)")
parser.add_argument("--batch_size", type=int, default=256, help="Batch size (default: 256)")

# charge state probabilities
parser.add_argument("--p_charge", type=float, default=0.5, help="Probability of being charged (default: 0.5)")
parser.add_argument("--min_charge_contrib", type=float, default=0.15,
help="Minimum charge contribution (default: 0.15)")

# Noise settings
# -- 1. RT and IM noise
Expand Down Expand Up @@ -207,7 +213,7 @@ def main():
action="store_true",
dest="proteome_mix",
)
parser.set_defaults(proteome_mixture=False)
parser.set_defaults(proteome_mix=False)

# Parse the arguments
args = parser.parse_args()
Expand All @@ -221,15 +227,15 @@ def main():
# Print table
print(tabulate(table, headers=["Argument", "Value"], tablefmt="grid"))

# Save the arguments to a file
with open(os.path.join(args.path, 'arguments.txt'), 'w') as f:
f.write(tabulate(table, headers=["Argument", "Value"], tablefmt="grid"))

# Use the arguments
path = check_path(args.path)
reference_path = check_path(args.reference_path)
name = args.name.replace('[PLACEHOLDER]', f'{args.acquisition_type}').replace("'", "")

# Save the arguments to a file, should go into the database folder
with open(os.path.join(path, f'arguments-{name}.txt'), 'w') as f:
f.write(tabulate(table, headers=["Argument", "Value"], tablefmt="grid"))

if args.proteome_mix:
factors = get_dilution_factors()

Expand Down Expand Up @@ -307,8 +313,8 @@ def main():

peptides = pd.concat(peptide_list)

if args.sample_fraction < 1.0:
peptides = peptides.sample(frac=args.sample_fraction)
if args.sample_peptides:
peptides = peptides.sample(n=args.num_sample_peptides, random_state=41)
peptides.reset_index(drop=True, inplace=True)

if verbose:
Expand All @@ -328,6 +334,10 @@ def main():
columns[-2], columns[-1] = columns[-1], columns[-2]
peptides = peptides[columns]

# get the number of available threads of the system if not specified
if args.num_threads == -1:
args.num_threads = os.cpu_count()

# JOB 4: Simulate frame distributions emg
peptides = simulate_frame_distributions_emg(
peptides=peptides,
Expand Down Expand Up @@ -355,7 +365,8 @@ def main():
peptides=peptides,
mz_lower=acquisition_builder.tdf_writer.helper_handle.mz_lower,
mz_upper=acquisition_builder.tdf_writer.helper_handle.mz_upper,
p_charge=p_charge
p_charge=p_charge,
min_charge_contrib=args.min_charge_contrib,
)

# JOB 6: Simulate ion mobilities
Expand Down
2 changes: 1 addition & 1 deletion imspy/imspy/timstof/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def get_py_ptr(self):


class TimsDataset(ABC):
def __init__(self, data_path: str, in_memory: bool = True):
def __init__(self, data_path: str, in_memory: bool = False):
"""TimsDataHandle class.
Args:
Expand Down
6 changes: 3 additions & 3 deletions imspy/imspy/timstof/dbsearch/imspy_dda.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def main():
parser.add_argument("--refinement_verbose", dest="refinement_verbose", action="store_true", help="Refinement verbose")
parser.set_defaults(refinement_verbose=False)

parser.add_argument("--score", type=str, default="hyper_score", help="Score type (default: hyper_score)")
parser.add_argument("--rescore_score", type=str, default="hyper_score", help="Score type to be used for re-scoring (default: hyper_score)")

args = parser.parse_args()

Expand Down Expand Up @@ -305,7 +305,7 @@ def main():
start_time = time.time()

scores = ["hyper_score", "beta_score"]
assert args.score in scores, f"Score type {args.score} not supported. Supported score types are: {scores}"
assert args.rescore_score in scores, f"Score type {args.rescore_score} not supported. Supported score types are: {scores}"

if args.verbose:
print(f"found {len(paths)} RAW data folders in {args.path} ...")
Expand Down Expand Up @@ -703,7 +703,7 @@ def main():
psms = list(sorted(psms, key=lambda psm: (psm.spec_idx, psm.peptide_idx)))

psms = re_score_psms(psms=psms, verbose=args.verbose, num_splits=args.re_score_num_splits,
balance=args.balanced_re_score, score=args.score)
balance=args.balanced_re_score, score=args.rescore_score)

# serialize all PSMs to JSON binary
bts = psms_to_json_bin(psms)
Expand Down
4 changes: 2 additions & 2 deletions imspy/imspy/timstof/dda.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ def _load_pasef_meta_data(self):
def get_pasef_fragments(self, num_threads: int = 1) -> pd.DataFrame:
"""Get PASEF fragments.
Args:
num_threads (int, optional): Number of threads. Defaults to 1.
        Args: num_threads (int, optional): Number of threads. Defaults to 1. CAUTION: as long as the connection to
        datasets is established via the Bruker .so / .dll, using multiple threads is unstable.
Returns:
List[FragmentDDA]: List of PASEF fragments.
Expand Down

0 comments on commit 0b8cad2

Please sign in to comment.