Skip to content

Commit

Permalink
Merge pull request #233 from theGreatHerrLebert/david@simulation
Browse files Browse the repository at this point in the history
David@simulation
  • Loading branch information
theGreatHerrLebert authored Jul 17, 2024
2 parents 8954eb4 + 5098bd8 commit 0b8cad2
Show file tree
Hide file tree
Showing 9 changed files with 163 additions and 25 deletions.
121 changes: 121 additions & 0 deletions imspy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# imspy - Python package for working with timsTOF raw data



## Raw data access

### Establish a connection to a timsTOF raw file and access data

```python
import numpy as np
from imspy.timstof import TimsDataset

# you can use in-memory mode for faster access, but it requires more memory
tdf = TimsDataset("path/to/rawfolder.d", in_memory=False)

# show global meta data table
print(tdf.global_meta_data)

# show frame meta data
print(tdf.meta_data)

# get the first frame (Bruker frame indices start at 1)
frame = tdf.get_tims_frame(1)

# you can also use indexing
frame = tdf[1]

# print data as pandas dataframe
frame.df()

# get all spectra in a tims frame (sorted by scan = ion mobility)
spectra = frame.to_tims_spectra()

# get a slice of multiple frames
frames = tdf.get_tims_slice(np.array([1, 2, 3]))

# or, by using slicing
frames = tdf[1:4]
```

### DDA data

```python
from imspy.timstof import TimsDatasetDDA
# read a DDA dataset
tdf = TimsDatasetDDA("path/to/rawfolder.d", in_memory=False)

# get raw data of precursors together with their fragment ions
dda_fragments = tdf.get_pasef_fragments()

# the timsTOF re-fragments precursors below a certain intensity threshold,
# you can aggregate the data for increased sensitivity like so:
dda_fragments_grouped = dda_fragments.groupby('precursor_id').agg({
'frame_id': 'first',
'time': 'first',
'precursor_id': 'first',
# this will sum up the raw data of all fragments with the same precursor_id
'raw_data': 'sum',
'scan_begin': 'first',
'scan_end': 'first',
'isolation_mz': 'first',
'isolation_width': 'first',
'collision_energy': 'first',
'largest_peak_mz': 'first',
'average_mz': 'first',
'monoisotopic_mz': 'first',
'charge': 'first',
'average_scan': 'first',
'intensity': 'first',
'parent_id': 'first',
})

# for convenience, you can calculate the inverse mobility
# of the precursor ion by finding the maximum intensity along the scan dimension
mobility = dda_fragments_grouped.apply(
lambda r: r.raw_data.get_inverse_mobility_along_scan_marginal(), axis=1
)

# add the inverse mobility to the grouped data as a new column
dda_fragments_grouped['mobility'] = mobility
```

### DIA data

```python
from imspy.timstof import TimsDatasetDIA
# read a DIA dataset
tdf = TimsDatasetDIA("path/to/rawfolder.d", in_memory=False)
```

## The chemistry module

### Basic usage
```python
```

### Working with peptide sequences
```python
```

## Algorithms and machine learning

### ion mobility and retention time prediction
```python
```

### Locality sensitive hashing
```python
```

### Mixture models
```python
```

## Pipeline: DDA data analysis (imspy_dda)
```python
```

## Pipeline: Synthetic raw data generation (timsim)
```python
```
6 changes: 3 additions & 3 deletions imspy/imspy/simulation/resources/configs/dilution_factors.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
proteome,dilution_factor
"HUMAN.fasta",1.0
"YEAST.fasta",0.33
"ECOLI.fasta",0.11
"HUMAN.fasta",0.65
"YEAST.fasta",0.30
"ECOLI.fasta",0.05
2 changes: 1 addition & 1 deletion imspy/imspy/simulation/timsim/jobs/build_acquisition.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from imspy.simulation.acquisition import TimsTofAcquisitionBuilderDIA
from imspy.simulation.utility import read_acquisition_config
from imspy.timstof import TimsDataset, TimsDatasetDIA
from imspy.timstof import TimsDatasetDIA


def build_acquisition(
Expand Down
7 changes: 6 additions & 1 deletion imspy/imspy/simulation/timsim/jobs/simulate_charge_states.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def simulate_charge_states(
mz_upper: float,
p_charge: float = 0.5,
charge_state_one_probability: float = 0.0,
min_charge_contrib: float = 0.15,
) -> pd.DataFrame:

IonSource = DeepChargeStateDistribution(
Expand All @@ -20,7 +21,11 @@ def simulate_charge_states(
)

# IonSource = BinomialChargeStateDistributionModel(charged_probability=p_charge)
peptide_ions = IonSource.simulate_charge_state_distribution_pandas(peptides, charge_state_one_probability=charge_state_one_probability)
peptide_ions = IonSource.simulate_charge_state_distribution_pandas(
peptides,
charge_state_one_probability=charge_state_one_probability,
min_charge_contrib=min_charge_contrib,
)

# merge tables to have sequences with ions, remove mz values outside scope
ions = pd.merge(left=peptide_ions, right=peptides, left_on=['peptide_id'], right_on=['peptide_id'])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd

from imspy.algorithm import DeepChromatographyApex, load_tokenizer_from_resources, load_deep_retention_time_predictor
from imspy.algorithm.rt.predictors import DeepChromatographyApex, load_deep_retention_time_predictor
from imspy.algorithm.utility import load_tokenizer_from_resources


def simulate_retention_times(
Expand Down
37 changes: 24 additions & 13 deletions imspy/imspy/simulation/timsim/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,19 @@ def main():
help="Use the layout of the reference dataset for the acquisition (default: True)")
parser.set_defaults(use_reference_layout=True)

parser.add_argument("--no_peptide_sampling", dest="sample_peptides", action="store_false",
help="Sample peptides from the digested fasta (default: True)")
parser.set_defaults(sample_peptides=True)

# Peptide digestion arguments
parser.add_argument(
"--sample_fraction",
type=float,
default=0.005,
"--num_sample_peptides",
type=int,
default=25_000,
    help="Number of peptides to be sampled at random from the digested fasta (default: 25000)")

parser.add_argument("--missed_cleavages", type=int, default=2, help="Number of missed cleavages (default: 2)")
parser.add_argument("--min_len", type=int, default=9, help="Minimum peptide length (default: 7)")
parser.add_argument("--min_len", type=int, default=7, help="Minimum peptide length (default: 7)")
parser.add_argument("--max_len", type=int, default=30, help="Maximum peptide length (default: 30)")
parser.add_argument("--cleave_at", type=str, default='KR', help="Cleave at (default: KR)")
parser.add_argument("--restrict", type=str, default='P', help="Restrict (default: P)")
Expand Down Expand Up @@ -134,11 +138,13 @@ def main():
help="Sampling step size for frame distributions (default: 0.001)")

# Number of cores to use
parser.add_argument("--num_threads", type=int, default=16, help="Number of threads to use (default: 16)")
parser.add_argument("--num_threads", type=int, default=-1, help="Number of threads to use (default: -1, all available)")
parser.add_argument("--batch_size", type=int, default=256, help="Batch size (default: 256)")

# charge state probabilities
parser.add_argument("--p_charge", type=float, default=0.5, help="Probability of being charged (default: 0.5)")
parser.add_argument("--min_charge_contrib", type=float, default=0.15,
help="Minimum charge contribution (default: 0.15)")

# Noise settings
# -- 1. RT and IM noise
Expand Down Expand Up @@ -207,7 +213,7 @@ def main():
action="store_true",
dest="proteome_mix",
)
parser.set_defaults(proteome_mixture=False)
parser.set_defaults(proteome_mix=False)

# Parse the arguments
args = parser.parse_args()
Expand All @@ -221,15 +227,15 @@ def main():
# Print table
print(tabulate(table, headers=["Argument", "Value"], tablefmt="grid"))

# Save the arguments to a file
with open(os.path.join(args.path, 'arguments.txt'), 'w') as f:
f.write(tabulate(table, headers=["Argument", "Value"], tablefmt="grid"))

# Use the arguments
path = check_path(args.path)
reference_path = check_path(args.reference_path)
name = args.name.replace('[PLACEHOLDER]', f'{args.acquisition_type}').replace("'", "")

# Save the arguments to a file, should go into the database folder
with open(os.path.join(path, f'arguments-{name}.txt'), 'w') as f:
f.write(tabulate(table, headers=["Argument", "Value"], tablefmt="grid"))

if args.proteome_mix:
factors = get_dilution_factors()

Expand Down Expand Up @@ -307,8 +313,8 @@ def main():

peptides = pd.concat(peptide_list)

if args.sample_fraction < 1.0:
peptides = peptides.sample(frac=args.sample_fraction)
if args.sample_peptides:
peptides = peptides.sample(n=args.num_sample_peptides, random_state=41)
peptides.reset_index(drop=True, inplace=True)

if verbose:
Expand All @@ -328,6 +334,10 @@ def main():
columns[-2], columns[-1] = columns[-1], columns[-2]
peptides = peptides[columns]

# get the number of available threads of the system if not specified
if args.num_threads == -1:
args.num_threads = os.cpu_count()

# JOB 4: Simulate frame distributions emg
peptides = simulate_frame_distributions_emg(
peptides=peptides,
Expand Down Expand Up @@ -355,7 +365,8 @@ def main():
peptides=peptides,
mz_lower=acquisition_builder.tdf_writer.helper_handle.mz_lower,
mz_upper=acquisition_builder.tdf_writer.helper_handle.mz_upper,
p_charge=p_charge
p_charge=p_charge,
min_charge_contrib=args.min_charge_contrib,
)

# JOB 6: Simulate ion mobilities
Expand Down
2 changes: 1 addition & 1 deletion imspy/imspy/timstof/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def get_py_ptr(self):


class TimsDataset(ABC):
def __init__(self, data_path: str, in_memory: bool = True):
def __init__(self, data_path: str, in_memory: bool = False):
"""TimsDataHandle class.
Args:
Expand Down
6 changes: 3 additions & 3 deletions imspy/imspy/timstof/dbsearch/imspy_dda.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def main():
parser.add_argument("--refinement_verbose", dest="refinement_verbose", action="store_true", help="Refinement verbose")
parser.set_defaults(refinement_verbose=False)

parser.add_argument("--score", type=str, default="hyper_score", help="Score type (default: hyper_score)")
parser.add_argument("--rescore_score", type=str, default="hyper_score", help="Score type to be used for re-scoring (default: hyper_score)")

args = parser.parse_args()

Expand Down Expand Up @@ -305,7 +305,7 @@ def main():
start_time = time.time()

scores = ["hyper_score", "beta_score"]
assert args.score in scores, f"Score type {args.score} not supported. Supported score types are: {scores}"
assert args.rescore_score in scores, f"Score type {args.rescore_score} not supported. Supported score types are: {scores}"

if args.verbose:
print(f"found {len(paths)} RAW data folders in {args.path} ...")
Expand Down Expand Up @@ -703,7 +703,7 @@ def main():
psms = list(sorted(psms, key=lambda psm: (psm.spec_idx, psm.peptide_idx)))

psms = re_score_psms(psms=psms, verbose=args.verbose, num_splits=args.re_score_num_splits,
balance=args.balanced_re_score, score=args.score)
balance=args.balanced_re_score, score=args.rescore_score)

# serialize all PSMs to JSON binary
bts = psms_to_json_bin(psms)
Expand Down
4 changes: 2 additions & 2 deletions imspy/imspy/timstof/dda.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ def _load_pasef_meta_data(self):
def get_pasef_fragments(self, num_threads: int = 1) -> pd.DataFrame:
"""Get PASEF fragments.
Args:
num_threads (int, optional): Number of threads. Defaults to 1.
        Args: num_threads (int, optional): Number of threads. Defaults to 1. CAUTION: as long as the connection to
        datasets is established via the Bruker .so / .dll, using multiple threads is unstable.
Returns:
List[FragmentDDA]: List of PASEF fragments.
Expand Down

0 comments on commit 0b8cad2

Please sign in to comment.