Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

David@simulation #233

Merged
merged 16 commits into from
Jul 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 121 additions & 0 deletions imspy/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# imspy - Python package for working with timsTOF raw data



## Raw data access

### Establish a connection to a timsTOF raw file and access data

```python
import numpy as np
from imspy.timstof import TimsDataset

# you can use in-memory mode for faster access, but it requires more memory
tdf = TimsDataset("path/to/rawfolder.d", in_memory=False)

# show global meta data table
print(tdf.global_meta_data)

# show frame meta data
print(tdf.meta_data)

# get the first frame (Bruker frame indices start at 1)
frame = tdf.get_tims_frame(1)

# you can also use indexing
frame = tdf[1]

# print data as pandas dataframe
frame.df()

# get all spectra in a tims frame (sorted by scan = ion mobility)
spectra = frame.to_tims_spectra()

# get a slice of multiple frames
frames = tdf.get_tims_slice(np.array([1, 2, 3]))

# or, by using slicing
frames = tdf[1:4]
```

### DDA data

```python
from imspy.timstof import TimsDatasetDDA
# read a DDA dataset
tdf = TimsDatasetDDA("path/to/rawfolder.d", in_memory=False)

# get raw data of precursors together with their fragment ions
dda_fragments = tdf.get_pasef_fragments()

# the timsTOF re-fragments precursors below a certain intensity threshold,
# you can aggregate the data for increased sensitivity like so:
dda_fragments_grouped = dda_fragments.groupby('precursor_id').agg({
'frame_id': 'first',
'time': 'first',
'precursor_id': 'first',
# this will sum up the raw data of all fragments with the same precursor_id
'raw_data': 'sum',
'scan_begin': 'first',
'scan_end': 'first',
'isolation_mz': 'first',
'isolation_width': 'first',
'collision_energy': 'first',
'largest_peak_mz': 'first',
'average_mz': 'first',
'monoisotopic_mz': 'first',
'charge': 'first',
'average_scan': 'first',
'intensity': 'first',
'parent_id': 'first',
})

# for convenience, you can calculate the inverse mobility
# of the precursor ion by finding the maximum intensity along the scan dimension
mobility = dda_fragments_grouped.apply(
lambda r: r.raw_data.get_inverse_mobility_along_scan_marginal(), axis=1
)

# add the inverse mobility to the grouped data as a new column
dda_fragments_grouped['mobility'] = mobility
```

### DIA data

```python
from imspy.timstof import TimsDatasetDIA
# read a DIA dataset
tdf = TimsDatasetDIA("path/to/rawfolder.d", in_memory=False)
```

## The chemistry module

### Basic usage
```python
```

### Working with peptide sequences
```python
```

## Algorithms and machine learning

### ion mobility and retention time prediction
```python
```

### Locality sensitive hashing
```python
```

### Mixture models
```python
```

## Pipeline: DDA data analysis (imspy_dda)
```python
```

## Pipeline: Synthetic raw data generation (timsim)
```python
```
6 changes: 3 additions & 3 deletions imspy/imspy/simulation/resources/configs/dilution_factors.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
proteome,dilution_factor
"HUMAN.fasta",1.0
"YEAST.fasta",0.33
"ECOLI.fasta",0.11
"HUMAN.fasta",0.65
"YEAST.fasta",0.30
"ECOLI.fasta",0.05
2 changes: 1 addition & 1 deletion imspy/imspy/simulation/timsim/jobs/build_acquisition.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from imspy.simulation.acquisition import TimsTofAcquisitionBuilderDIA
from imspy.simulation.utility import read_acquisition_config
from imspy.timstof import TimsDataset, TimsDatasetDIA
from imspy.timstof import TimsDatasetDIA


def build_acquisition(
Expand Down
7 changes: 6 additions & 1 deletion imspy/imspy/simulation/timsim/jobs/simulate_charge_states.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def simulate_charge_states(
mz_upper: float,
p_charge: float = 0.5,
charge_state_one_probability: float = 0.0,
min_charge_contrib: float = 0.15,
) -> pd.DataFrame:

IonSource = DeepChargeStateDistribution(
Expand All @@ -20,7 +21,11 @@ def simulate_charge_states(
)

# IonSource = BinomialChargeStateDistributionModel(charged_probability=p_charge)
peptide_ions = IonSource.simulate_charge_state_distribution_pandas(peptides, charge_state_one_probability=charge_state_one_probability)
peptide_ions = IonSource.simulate_charge_state_distribution_pandas(
peptides,
charge_state_one_probability=charge_state_one_probability,
min_charge_contrib=min_charge_contrib,
)

# merge tables to have sequences with ions, remove mz values outside scope
ions = pd.merge(left=peptide_ions, right=peptides, left_on=['peptide_id'], right_on=['peptide_id'])
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd

from imspy.algorithm import DeepChromatographyApex, load_tokenizer_from_resources, load_deep_retention_time_predictor
from imspy.algorithm.rt.predictors import DeepChromatographyApex, load_deep_retention_time_predictor
from imspy.algorithm.utility import load_tokenizer_from_resources


def simulate_retention_times(
Expand Down
37 changes: 24 additions & 13 deletions imspy/imspy/simulation/timsim/simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,19 @@ def main():
help="Use the layout of the reference dataset for the acquisition (default: True)")
parser.set_defaults(use_reference_layout=True)

parser.add_argument("--no_peptide_sampling", dest="sample_peptides", action="store_false",
help="Sample peptides from the digested fasta (default: True)")
parser.set_defaults(sample_peptides=True)

# Peptide digestion arguments
parser.add_argument(
"--sample_fraction",
type=float,
default=0.005,
"--num_sample_peptides",
type=int,
default=25_000,
help="Number of peptides to be sampled at random from the digested fasta (default: 25000)")

parser.add_argument("--missed_cleavages", type=int, default=2, help="Number of missed cleavages (default: 2)")
parser.add_argument("--min_len", type=int, default=9, help="Minimum peptide length (default: 7)")
parser.add_argument("--min_len", type=int, default=7, help="Minimum peptide length (default: 7)")
parser.add_argument("--max_len", type=int, default=30, help="Maximum peptide length (default: 30)")
parser.add_argument("--cleave_at", type=str, default='KR', help="Cleave at (default: KR)")
parser.add_argument("--restrict", type=str, default='P', help="Restrict (default: P)")
Expand Down Expand Up @@ -134,11 +138,13 @@ def main():
help="Sampling step size for frame distributions (default: 0.001)")

# Number of cores to use
parser.add_argument("--num_threads", type=int, default=16, help="Number of threads to use (default: 16)")
parser.add_argument("--num_threads", type=int, default=-1, help="Number of threads to use (default: -1, all available)")
parser.add_argument("--batch_size", type=int, default=256, help="Batch size (default: 256)")

# charge state probabilities
parser.add_argument("--p_charge", type=float, default=0.5, help="Probability of being charged (default: 0.5)")
parser.add_argument("--min_charge_contrib", type=float, default=0.15,
help="Minimum charge contribution (default: 0.15)")

# Noise settings
# -- 1. RT and IM noise
Expand Down Expand Up @@ -207,7 +213,7 @@ def main():
action="store_true",
dest="proteome_mix",
)
parser.set_defaults(proteome_mixture=False)
parser.set_defaults(proteome_mix=False)

# Parse the arguments
args = parser.parse_args()
Expand All @@ -221,15 +227,15 @@ def main():
# Print table
print(tabulate(table, headers=["Argument", "Value"], tablefmt="grid"))

# Save the arguments to a file
with open(os.path.join(args.path, 'arguments.txt'), 'w') as f:
f.write(tabulate(table, headers=["Argument", "Value"], tablefmt="grid"))

# Use the arguments
path = check_path(args.path)
reference_path = check_path(args.reference_path)
name = args.name.replace('[PLACEHOLDER]', f'{args.acquisition_type}').replace("'", "")

# Save the arguments to a file, should go into the database folder
with open(os.path.join(path, f'arguments-{name}.txt'), 'w') as f:
f.write(tabulate(table, headers=["Argument", "Value"], tablefmt="grid"))

if args.proteome_mix:
factors = get_dilution_factors()

Expand Down Expand Up @@ -307,8 +313,8 @@ def main():

peptides = pd.concat(peptide_list)

if args.sample_fraction < 1.0:
peptides = peptides.sample(frac=args.sample_fraction)
if args.sample_peptides:
peptides = peptides.sample(n=args.num_sample_peptides, random_state=41)
peptides.reset_index(drop=True, inplace=True)

if verbose:
Expand All @@ -328,6 +334,10 @@ def main():
columns[-2], columns[-1] = columns[-1], columns[-2]
peptides = peptides[columns]

# get the number of available threads of the system if not specified
if args.num_threads == -1:
args.num_threads = os.cpu_count()

# JOB 4: Simulate frame distributions emg
peptides = simulate_frame_distributions_emg(
peptides=peptides,
Expand Down Expand Up @@ -355,7 +365,8 @@ def main():
peptides=peptides,
mz_lower=acquisition_builder.tdf_writer.helper_handle.mz_lower,
mz_upper=acquisition_builder.tdf_writer.helper_handle.mz_upper,
p_charge=p_charge
p_charge=p_charge,
min_charge_contrib=args.min_charge_contrib,
)

# JOB 6: Simulate ion mobilities
Expand Down
2 changes: 1 addition & 1 deletion imspy/imspy/timstof/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def get_py_ptr(self):


class TimsDataset(ABC):
def __init__(self, data_path: str, in_memory: bool = True):
def __init__(self, data_path: str, in_memory: bool = False):
"""TimsDataHandle class.

Args:
Expand Down
6 changes: 3 additions & 3 deletions imspy/imspy/timstof/dbsearch/imspy_dda.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ def main():
parser.add_argument("--refinement_verbose", dest="refinement_verbose", action="store_true", help="Refinement verbose")
parser.set_defaults(refinement_verbose=False)

parser.add_argument("--score", type=str, default="hyper_score", help="Score type (default: hyper_score)")
parser.add_argument("--rescore_score", type=str, default="hyper_score", help="Score type to be used for re-scoring (default: hyper_score)")

args = parser.parse_args()

Expand Down Expand Up @@ -305,7 +305,7 @@ def main():
start_time = time.time()

scores = ["hyper_score", "beta_score"]
assert args.score in scores, f"Score type {args.score} not supported. Supported score types are: {scores}"
assert args.rescore_score in scores, f"Score type {args.rescore_score} not supported. Supported score types are: {scores}"

if args.verbose:
print(f"found {len(paths)} RAW data folders in {args.path} ...")
Expand Down Expand Up @@ -703,7 +703,7 @@ def main():
psms = list(sorted(psms, key=lambda psm: (psm.spec_idx, psm.peptide_idx)))

psms = re_score_psms(psms=psms, verbose=args.verbose, num_splits=args.re_score_num_splits,
balance=args.balanced_re_score, score=args.score)
balance=args.balanced_re_score, score=args.rescore_score)

# serialize all PSMs to JSON binary
bts = psms_to_json_bin(psms)
Expand Down
4 changes: 2 additions & 2 deletions imspy/imspy/timstof/dda.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ def _load_pasef_meta_data(self):
def get_pasef_fragments(self, num_threads: int = 1) -> pd.DataFrame:
"""Get PASEF fragments.

Args:
num_threads (int, optional): Number of threads. Defaults to 1.
Args:
    num_threads (int, optional): Number of threads. Defaults to 1. CAUTION: as long as the connection to
        datasets is established via the Bruker .so / .dll, using multiple threads is unstable.

Returns:
pd.DataFrame: DataFrame of PASEF fragments.
Expand Down
Loading