From 3946957fc70e5ec1a2136a74e7011736f4e401d7 Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 6 Dec 2022 11:01:35 -0500 Subject: [PATCH 01/62] selene changes for methylation model --- selene_sdk/samplers/dataloader.py | 4 +- .../samplers/methylation_data_sampler.py | 412 ++++++++++ selene_sdk/samplers/multi_sampler.py | 22 +- selene_sdk/targets/genomic_features_h5.py | 256 ++++++ selene_sdk/train_methylation_model.py | 749 ++++++++++++++++++ .../utils/methylation_performance_metrics.py | 315 ++++++++ .../utils/non_strand_specific_module.py | 9 +- 7 files changed, 1752 insertions(+), 15 deletions(-) create mode 100644 selene_sdk/samplers/methylation_data_sampler.py create mode 100644 selene_sdk/targets/genomic_features_h5.py create mode 100644 selene_sdk/train_methylation_model.py create mode 100644 selene_sdk/utils/methylation_performance_metrics.py diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index 166efddd..d6b07656 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -55,12 +55,12 @@ def __getitem__(self, index): The shape of `targets` will be :math:`I \\times T`, where :math:`T` is the number of targets predicted. """ - sequences, targets = self.sampler.sample( + sequences, targets, inds = self.sampler.sample( batch_size=1 if isinstance(index, int) else len(index)) if sequences.shape[0] == 1: sequences = sequences[0,:] targets = targets[0,:] - return sequences, targets + return sequences, targets, inds def __len__(self): """ diff --git a/selene_sdk/samplers/methylation_data_sampler.py b/selene_sdk/samplers/methylation_data_sampler.py new file mode 100644 index 00000000..b2c68719 --- /dev/null +++ b/selene_sdk/samplers/methylation_data_sampler.py @@ -0,0 +1,412 @@ +""" +This module provides the RandomPositionsSampler class. + +TODO: Currently, only works with sequences from `selene_sdk.sequences.Genome`. +We would like to generalize this to `selene_sdk.sequences.Sequence` if possible. +""" +from collections import namedtuple +import logging +import random + +import numpy as np + +from functools import wraps +from .online_sampler import OnlineSampler +from ..utils import get_indices_and_probabilities + +logger = logging.getLogger(__name__) + + +SampleIndices = namedtuple( + "SampleIndices", ["indices", "weights"]) +""" +A tuple containing the indices for some samples, and a weight to +allot to each index when randomly drawing from them. + +TODO: this is common to both the intervals sampler and the +random positions sampler. Can we move this to utils or +somewhere else? + +Parameters +---------- +indices : list(int) + The numeric index of each sample. +weights : list(float) + The amount of weight assigned to each sample. + +Attributes +---------- +indices : list(int) + The numeric index of each sample. +weights : list(float) + The amount of weight assigned to each sample. + +""" + + +class MethylationDataSampler(OnlineSampler): + """This sampler randomly selects a position in the genome and queries for + a sequence centered at that position for input to the model. + + TODO: generalize to selene_sdk.sequences.Sequence? + + Parameters + ---------- + reference_sequence : selene_sdk.sequences.Genome + A reference sequence from which to create examples. + targets : GenomicFeaturesH5 + features : list(str) + List of distinct features that we aim to predict. + seed : int, optional + Default is 436. Sets the random seed for sampling. + validation_holdout : list(str) or float, optional + Default is `['chr6', 'chr7']`. 
Holdout can be regional or + proportional. If regional, expects a list (e.g. `['chrX', 'chrY']`). + Regions must match those specified in the first column of the + tabix-indexed BED file. If proportional, specify a percentage + between (0.0, 1.0). Typically 0.10 or 0.20. + test_holdout : list(str) or float, optional + Default is `['chr8', 'chr9']`. See documentation for + `validation_holdout` for additional information. + sequence_length : int, optional + Default is 1000. Model is trained on sequences of `sequence_length` + where genomic features are annotated to the center regions of + these sequences. + center_bin_to_predict : int, optional + Default is 200. Query the tabix-indexed file for a region of + length `center_bin_to_predict`. + mode : {'train', 'validate', 'test'} + Default is `'train'`. The mode to run the sampler in. + save_datasets : list(str), optional + Default is `['test']`. The list of modes for which we should + save the sampled data to file. + output_dir : str or None, optional + Default is None. The path to the directory where we should + save sampled examples for a mode. If `save_datasets` is + a non-empty list, `output_dir` must be specified. If + the path in `output_dir` does not exist it will be created + automatically. + + Attributes + ---------- + reference_sequence : selene_sdk.sequences.Genome + The reference sequence that examples are created from. + target : selene_sdk.targets.Target + The `selene_sdk.targets.Target` object holding the features that we + would like to predict. + validation_holdout : list(str) or float + The samples to hold out for validating model performance. These + can be "regional" or "proportional". If regional, this is a list + of region names (e.g. `['chrX', 'chrY']`). These regions must + match those specified in the first column of the tabix-indexed + BED file. If proportional, this is the fraction of total samples + that will be held out. + test_holdout : list(str) or float + The samples to hold out for testing model performance. See the + documentation for `validation_holdout` for more details. + sequence_length : int + The length of the sequences to train the model on. + modes : list(str) + The list of modes that the sampler can be run in. + mode : str + The current mode that the sampler is running in. Must be one of + the modes listed in `modes`. 
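+
+    Examples
+    --------
+    A minimal usage sketch only; the FASTA path, target file, and feature
+    names below are illustrative assumptions, not files shipped with Selene::
+
+        >>> from selene_sdk.sequences import Genome
+        >>> from selene_sdk.samplers.methylation_data_sampler import \
+        ...     MethylationDataSampler
+        >>> sampler = MethylationDataSampler(
+        ...     reference_sequence=Genome("reference.fa"),
+        ...     target_path="methylation_targets.bed.gz",
+        ...     features=["cpg_meth_sample1", "cpg_meth_sample2"],
+        ...     sequence_length=1000,
+        ...     center_bin_to_predict=200)
+        >>> sequences, targets, indicator = sampler.sample(batch_size=16)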
+ + """ + def __init__(self, + reference_sequence, + target_path, + features, + seed=436, + validation_holdout=['chr6', 'chr7'], + test_holdout=['chr8', 'chr9'], + sequence_length=1000, + center_bin_to_predict=200, + feature_thresholds=0.5, + mode="train", + save_datasets=[], + output_dir=None): + super(MethylationDataSampler, self).__init__( + reference_sequence, + target_path, + features, + seed=seed, + validation_holdout=validation_holdout, + test_holdout=test_holdout, + sequence_length=sequence_length, + center_bin_to_predict=center_bin_to_predict, + feature_thresholds=feature_thresholds, + mode=mode, + save_datasets=save_datasets, + output_dir=output_dir) + + self._sample_from_mode = {} + self._randcache = {} + for mode in self.modes: + self._sample_from_mode[mode] = None + self._randcache[mode] = {"cache_indices": None, "sample_next": 0} + + self.sample_from_intervals = [] + self.interval_lengths = [] + self._initialized = False + + def init(func): + # delay initialization to allow multiprocessing + @wraps(func) + def dfunc(self, *args, **kwargs): + if not self._initialized: + if self._holdout_type == "chromosome": + self._partition_genome_by_chromosome() + else: + self._partition_genome_by_proportion() + + for mode in self.modes: + self._update_randcache(mode=mode) + self._initialized = True + return func(self, *args, **kwargs) + return dfunc + + + def _partition_genome_by_proportion(self): + for chrom, len_chrom in self.reference_sequence.get_chr_lens(): + if '_' in chrom: + continue + if 'X' in chrom or 'Y' in chrom: + continue + self.sample_from_intervals.append( + (chrom, + self.sequence_length, + len_chrom - self.sequence_length)) + self.interval_lengths.append(len_chrom) + n_intervals = len(self.sample_from_intervals) + + select_indices = list(range(n_intervals)) + np.random.shuffle(select_indices) + n_indices_validate = int(n_intervals * self.validation_holdout) + val_indices, val_weights = get_indices_and_probabilities( + self.interval_lengths, select_indices[:n_indices_validate]) + self._sample_from_mode["validate"] = SampleIndices( + val_indices, val_weights) + + if self.test_holdout: + n_indices_test = int(n_intervals * self.test_holdout) + test_indices_end = n_indices_test + n_indices_validate + test_indices, test_weights = get_indices_and_probabilities( + self.interval_lengths, + select_indices[n_indices_validate:test_indices_end]) + self._sample_from_mode["test"] = SampleIndices( + test_indices, test_weights) + + tr_indices, tr_weights = get_indices_and_probabilities( + self.interval_lengths, select_indices[test_indices_end:]) + self._sample_from_mode["train"] = SampleIndices( + tr_indices, tr_weights) + else: + tr_indices, tr_weights = get_indices_and_probabilities( + self.interval_lengths, select_indices[n_indices_validate:]) + self._sample_from_mode["train"] = SampleIndices( + tr_indices, tr_weights) + + def _partition_genome_by_chromosome(self): + for mode in self.modes: + self._sample_from_mode[mode] = SampleIndices([], []) + index = 0 + for (chrom, len_chrom) in self.reference_sequence.get_chr_lens(): + if '_' in chrom: + continue + if 'X' in chrom or 'Y' in chrom: + continue + if chrom in self.validation_holdout: + self._sample_from_mode["validate"].indices.append( + index) + elif self.test_holdout and chrom in self.test_holdout: + self._sample_from_mode["test"].indices.append( + index) + else: + self._sample_from_mode["train"].indices.append( + index) + + self.sample_from_intervals.append( + (chrom, + self.sequence_length, + len_chrom - self.sequence_length)) + 
self.interval_lengths.append(len_chrom - 2 * self.sequence_length)
+            index += 1
+
+        for mode in self.modes:
+            sample_indices = self._sample_from_mode[mode].indices
+            indices, weights = get_indices_and_probabilities(
+                self.interval_lengths, sample_indices)
+            self._sample_from_mode[mode] = \
+                self._sample_from_mode[mode]._replace(
+                    indices=indices, weights=weights)
+
+    def _retrieve(self, chrom, position):
+        bin_start = position - self._start_radius
+        bin_end = position + self._end_radius
+        indicator = self.target.is_positive(
+            chrom, bin_start, bin_end)
+        retrieved_targets = self.target.get_feature_data(
+            chrom, bin_start, bin_end)
+        window_start = position - self._start_window_radius
+        window_end = position + self._end_window_radius
+        if window_end - window_start < self.sequence_length:
+            logger.debug(
+                "Window [{0}, {1}) is shorter than the expected sequence "
+                "length. Bin radii: ({2}, {3}), window radii: ({4}, {5}). "
+                "Sampling again.".format(
+                    window_start, window_end,
+                    self._start_radius, self._end_radius,
+                    self._start_window_radius, self._end_window_radius))
+            return None
+        strand = self.STRAND_SIDES[random.randint(0, 1)]
+        retrieved_seq = \
+            self.reference_sequence.get_encoding_from_coords(
+                chrom, window_start, window_end, strand)
+
+        if retrieved_seq.shape[0] == 0:
+            logger.info("Full sequence centered at {0} position {1} "
+                        "could not be retrieved. Sampling again.".format(
+                            chrom, position))
+            return None
+        elif np.mean(retrieved_seq == 0.25) > 0.30:
+            logger.info("Over 30% of the bases in the sequence centered "
+                        "at {0} position {1} are ambiguous ('N'). "
+                        "Sampling again.".format(chrom, position))
+            return None
+
+        if self.mode in self._save_datasets:
+            feature_indices = ';'.join(
+                [str(f) for f in np.nonzero(retrieved_targets)[0]])
+            self._save_datasets[self.mode].append(
+                [chrom,
+                 window_start,
+                 window_end,
+                 strand,
+                 feature_indices])
+            if len(self._save_datasets[self.mode]) > 200000:
+                self.save_dataset_to_file(self.mode)
+        return (retrieved_seq, retrieved_targets, indicator)
+
+    def _update_randcache(self, mode=None):
+        if not mode:
+            mode = self.mode
+        self._randcache[mode]["cache_indices"] = np.random.choice(
+            self._sample_from_mode[mode].indices,
+            size=200000,
+            replace=True,
+            p=self._sample_from_mode[mode].weights)
+        self._randcache[mode]["sample_next"] = 0
+
+    @init
+    def sample(self, batch_size=1, mode=None):
+        """
+        Randomly draws a mini-batch of examples and their corresponding
+        labels.
+
+        Parameters
+        ----------
+        batch_size : int, optional
+            Default is 1. The number of examples to include in the
+            mini-batch.
+        mode : str, optional
+            Default is None. The operating mode that the object should run in.
+            If None, will use the current mode `self.mode`.
+
+        Returns
+        -------
+        sequences, targets, indicator : \
+        tuple(numpy.ndarray, numpy.ndarray, numpy.ndarray)
+            A tuple containing the numeric representation of the
+            sequence examples, their corresponding labels, and the
+            methylation indicator for each example. The shape of
+            `sequences` will be :math:`B \\times L \\times N`, where
+            :math:`B` is `batch_size`, :math:`L` is the sequence length,
+            and :math:`N` is the size of the sequence type's alphabet.
+            The shape of `targets` will be :math:`B \\times F`, where
+            :math:`F` is the number of features, and `indicator` is a
+            vector of length :math:`B` marking whether the center bin of
+            each example overlaps a methylation target.
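+
+        Examples
+        --------
+        An illustrative sketch only (assumes an initialized sampler; the
+        shapes follow the description above)::
+
+            >>> sequences, targets, indicator = sampler.sample(batch_size=8)
+            >>> sequences.shape   # (8, sequence_length, 4)
+            >>> indicator.shape   # (8,)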
+ + """ + mode = mode if mode else self.mode + sequences = np.zeros((batch_size, self.sequence_length, 4)) + targets = np.zeros((batch_size, self.n_features)) + indicator = np.zeros(batch_size) + n_samples_drawn = 0 + while n_samples_drawn < batch_size: + sample_index = self._randcache[mode]["sample_next"] + if sample_index == len(self._randcache[mode]["cache_indices"]): + self._update_randcache() + sample_index = 0 + + rand_interval_index = \ + self._randcache[mode]["cache_indices"][sample_index] + self._randcache[mode]["sample_next"] += 1 + + chrom, cstart, cend = \ + self.sample_from_intervals[rand_interval_index] + position = np.random.randint(cstart, cend) + + retrieve_output = self._retrieve(chrom, position) + if not retrieve_output: + continue + seq, seq_targets, methyl = retrieve_output + sequences[n_samples_drawn, :, :] = seq + targets[n_samples_drawn, :] = seq_targets + indicator[n_samples_drawn] = methyl + n_samples_drawn += 1 + return (sequences, targets, indicator) + + + def get_data_and_targets(self, batch_size, n_samples=None, mode=None): + """ + This method fetches a subset of the data from the sampler, + divided into batches. This method also allows the user to + specify what operating mode to run the sampler in when fetching + the data. + + Parameters + ---------- + batch_size : int + The size of the batches to divide the data into. + n_samples : int or None, optional + Default is None. The total number of samples to retrieve. + If `n_samples` is None and the mode is `validate`, will + set `n_samples` to 32000; if the mode is `test`, will set + `n_samples` to 640000 if it is None. If the mode is `train` + you must have specified a value for `n_samples`. + mode : str, optional + Default is None. The mode to run the sampler in when + fetching the samples. See + `selene_sdk.samplers.IntervalsSampler.modes` for more + information. If None, will use the current mode `self.mode`. + + Returns + ------- + sequences_and_targets, targets_matrix : \ + tuple(list(tuple(numpy.ndarray, numpy.ndarray)), numpy.ndarray) + Tuple containing the list of sequence-target pairs, as well + as a single matrix with all targets in the same order. + Note that `sequences_and_targets`'s sequence elements are of + the shape :math:`B \\times L \\times N` and its target + elements are of the shape :math:`B \\times F`, where + :math:`B` is `batch_size`, :math:`L` is the sequence length, + :math:`N` is the size of the sequence type's alphabet, and + :math:`F` is the number of features. Further, + `target_matrix` is of the shape :math:`S \\times F`, where + :math:`S =` `n_samples`. 
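+
+        Examples
+        --------
+        Illustrative only (assumes an initialized sampler); note that this
+        implementation also returns the per-example methylation indicators::
+
+            >>> batches, targets_mat, ind_mat = sampler.get_data_and_targets(
+            ...     batch_size=64, n_samples=6400, mode="validate")
+            >>> len(batches)   # 100 batches of (sequences, targets, indicator)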
+ + """ + if mode is not None: + self.set_mode(mode) + else: + mode = self.mode + sequences_and_targets = [] + if n_samples is None and mode == "validate": + n_samples = 32000 + elif n_samples is None and mode == "test": + n_samples = 640000 + + n_batches = int(n_samples / batch_size) + for _ in range(n_batches): + inputs, targets, indicator = self.sample(batch_size) + sequences_and_targets.append((inputs, targets, indicator)) + targets_mat = np.vstack([t for (s, t, i) in sequences_and_targets]) + ind_mat = np.hstack([i for (s, t, i) in sequences_and_targets]) + if mode in self._save_datasets: + self.save_dataset_to_file(mode, close_filehandle=True) + return sequences_and_targets, targets_mat, ind_mat diff --git a/selene_sdk/samplers/multi_sampler.py b/selene_sdk/samplers/multi_sampler.py index 2e4eb6fe..c5f0ecba 100644 --- a/selene_sdk/samplers/multi_sampler.py +++ b/selene_sdk/samplers/multi_sampler.py @@ -216,13 +216,13 @@ def sample(self, batch_size=1, mode=None): else: self._set_batch_size(batch_size, mode=mode) try: - data, targets = next(self._iterators[mode]) - return data.numpy(), targets.numpy() + data, targets, ind = next(self._iterators[mode]) + return data.numpy(), targets.numpy(), ind.numpy() except StopIteration: #If DataLoader iterator reaches its length, reinitialize self._iterators[mode] = iter(self._dataloaders[mode]) - data, targets = next(self._iterators[mode]) - return data.numpy(), targets.numpy() + data, targets, ind = next(self._iterators[mode]) + return data.numpy(), targets.numpy(), ind.numpy() def get_data_and_targets(self, batch_size, n_samples=None, mode=None): """ @@ -260,18 +260,22 @@ def get_data_and_targets(self, batch_size, n_samples=None, mode=None): self._set_batch_size(batch_size, mode=mode) data_and_targets = [] targets_mat = [] + ind_mat = [] count = batch_size while count < n_samples: - data, tgts = self.sample(batch_size=batch_size, mode=mode) - data_and_targets.append((data, tgts)) + data, tgts, ind = self.sample(batch_size=batch_size, mode=mode) + data_and_targets.append((data, tgts, ind)) targets_mat.append(tgts) + ind_mat.append(ind) count += batch_size remainder = batch_size - (count - n_samples) - data, tgts = self.sample(batch_size=remainder) - data_and_targets.append((data, tgts)) + data, tgts, ind = self.sample(batch_size=remainder) + data_and_targets.append((data, tgts, ind)) targets_mat.append(tgts) + ind_mat.append(ind) targets_mat = np.vstack(targets_mat) - return data_and_targets, targets_mat + ind_mat = np.hstack(ind_mat) + return data_and_targets, targets_mat, ind_mat def get_validation_set(self, batch_size, n_samples=None): """ diff --git a/selene_sdk/targets/genomic_features_h5.py b/selene_sdk/targets/genomic_features_h5.py new file mode 100644 index 00000000..cd3bd796 --- /dev/null +++ b/selene_sdk/targets/genomic_features_h5.py @@ -0,0 +1,256 @@ +""" +This class contains methods to query a file of genomic coordinates, +where each row of [start, end) coordinates corresponds to a genomic target +in the sequence. + +It accepts the path to a tabix-indexed .bed.gz file of genomic coordinates and +the path to an HDF5 file containing the continuous-valued targets as a matrix. + +This .tsv/.bed file must contain the following columns, in order: + chrom ('1', '2', ..., 'X', 'Y'), start (0-based), end, index +where the index is the index of the corresponding row in the HDF5 file. + +Additionally, the column names should be omitted from the file itself +(i.e. 
there is no header and the first line in the file is the first +row of genome coordinates for a target). +""" +import types + +import h5py +import numpy as np +import tabix + +from functools import wraps +from .target import Target + + +def _get_target_data(chrom, start, end, + thresholds, target_index_dict, get_target_rows): + """ + Generates a target vector for the given query region. + + Parameters + ---------- + chrom : str + The name of the region (e.g. 'chr1', 'chr2', ..., 'chrX', + 'chrY') to query inside of. + start : int + The 0-based start coordinate of the region to query. + end : int + One past the last coordinate of the region to query. + thresholds : np.ndarray, dtype=numpy.float32 + An array of target thresholds, where the value in position + `i` corresponds to the threshold for the target name that is + mapped to index `i` by `target_index_dict`. + target_index_dict : dict + A dictionary mapping target names (`str`) to indices (`int`), + where the index is the position of the target in `targets`. + get_target_rows : types.FunctionType + A function that takes coordinates and returns rows + (`list(tuple(int, int, str))`). + + Returns + ------- + numpy.ndarray, dtype=int + A target vector where the `i`th position is equal to one if the + `i`th target is positive, and zero otherwise. + + """ + rows = get_target_rows(chrom, start, end) + return _fast_get_target_data( + start, end, thresholds, target_index_dict, rows) + + +class GenomicFeaturesH5(Target): + """ + Stores the dataset specifying sequence regions and targets. + Accepts a tabix-indexed `*.bed` file with the following columns, + in order: + :: + [chrom, start, end, strand, index] + + and an HDF5 file of the target values in a matrix with key `targets`. + + Note that `chrom` is interchangeable with any sort of region (e.g. + a protein in a FAA file). Further, `start` is 0-based. The `index` + corresponds to the row index of the targets in the HDF5 file. Lastly, any + addition columns following the five shown above will be ignored. + + Parameters + ---------- + tabix_path : str + Path to the tabix-indexed dataset. Note that for the file to + be tabix-indexed, it must have been compressed with `bgzip`. + Thus, `input_path` should be a `*.gz` file with a + corresponding `*.tbi` file in the same directory. + h5_path : str + Path to the HDF5 file of the targets matrix, with key `targets`. + targets : list(str) + The non-redundant list of genomic targets (i.e. labels) + that will be predicted. + init_unpicklable : bool, optional + Default is False. Delays initialization until a relevant method + is called. This enables the object to be pickled after instantiation. + `init_unpicklable` must be `False` when multi-processing is needed e.g. + DataLoader. Set `init_unpicklable` to True if you are using this class + directly through Selene's API and want to access class attributes + without having to call on a specific method in GenomicFeaturesH5. + + Attributes + ---------- + data : tabix.open + The data stored in a tabix-indexed `*.bed` file. + n_targets : int + The number of distinct targets. + target_index_dict : dict + A dictionary mapping target names (`str`) to indices (`int`), + where the index is the position of the target in `targets`. + index_target_dict : dict + A dictionary mapping indices (`int`) to target names (`str`), + where the index is the position of the target in the input + targets. 
+ target_thresholds : dict or None + + * `dict` - A dictionary mapping target names (`str`) to thresholds\ + (`float`), where the threshold is the minimum overlap that a\ + target annotation must have with a query region to be\ + considered a positive example of that target. + * `None` - No threshold specifications. Assumes that all targets\ + returned by a tabix query are annotated to the query region. + + """ + + def __init__(self, tabix_path, h5_path, targets, init_unpicklable=False): + """ + Constructs a new `GenomicFeaturesH5` object. + """ + self.tabix_path = tabix_path + self.h5_path = h5_path + + self.n_targets = len(targets) + + self.target_index_dict = dict( + [(feat, index) for index, feat in enumerate(targets)]) + self.index_target_dict = dict(list(enumerate(targets))) + + self._initialized = False + + if init_unpicklable: + self._unpicklable_init() + + def _unpicklable_init(self): + if not self._initialized: + self.coords = tabix.open(self.tabix_path) + self.data = h5py.File(self.h5_path, 'r')['targets'] + self._initialized = True + + def init(func): + # delay initialization to allow multiprocessing + @wraps(func) + def dfunc(self, *args, **kwargs): + self._unpicklable_init() + return func(self, *args, **kwargs) + return dfunc + + def _query_tabix(self, chrom, start, end): + """ + Queries a tabix-indexed `*.bed` file for targets falling into + the specified region. + + Parameters + ---------- + chrom : str + The name of the region (e.g. '1', '2', ..., 'X', 'Y') to + query in. + start : int + The 0-based start position of the query coordinates. + end : int + One past the last position of the query coordinates. + + Returns + ------- + list(list(str)) or None + A list, wherein each sub-list corresponds to a line from the + tabix-indexed file, and each value in a sub-list corresponds + to a column in that row. If a `tabix.TabixError` is caught, + we assume it was because there were no targets present in + the query region, and return `None`. + + """ + try: + return self.coords.query(chrom, start, end) + except tabix.TabixError: + return None + + @init + def is_positive(self, chrom, start, end): + """ + Determines whether the query the `chrom` queried contains any + genomic targets within the :math:`[start, end)` region. If so, + the query is considered positive. + + Parameters + ---------- + chrom : str + The name of the region (e.g. '1', '2', ..., 'X', 'Y'). + start : int + The 0-based first position in the region. + end : int + One past the 0-based last position in the region. + Returns + ------- + bool + `True` if this meets the criterion for a positive example, + `False` otherwise. + Note that if we catch a `tabix.TabixError` exception, we + assume the error was the result of no targets being present + in the queried region and return `False`. + """ + rows = self._query_tabix(chrom, start, end) + if rows is None: + return False + try: + rows.__next__() + return True + except StopIteration: + return False + + @init + def get_feature_data(self, chrom, start, end): + """ + Computes which targets overlap with the given region. + + Parameters + ---------- + chrom : str + The name of the region (e.g. '1', '2', ..., 'X', 'Y'). + start : int + The 0-based first position in the region. + end : int + One past the 0-based last position in the region. 
+ + Returns + ------- + numpy.ndarray + If the tabix query finds an overlap with the input region, + `get_feature_data` will return a target vector of size + `self.n_targets`, retrived from the matrix stored in HDF5 at the + specified row index. If multiple overlaps are found, returns + the average of all the overlap target rows. + + If we catch a `tabix.TabixError`, we assume the error was + the result of there being no targets present in the queried region + and return a `numpy.ndarray` of NaNs. + + """ + nans = np.zeros(self.n_targets) * np.nan + rows = self._query_tabix(chrom, start, end) + if not rows: + return nans + row_targets = [] + for r in rows: + target = int(r[3]) + row_targets.append(self.data[target]) + if len(row_targets) == 0: + return nans + return np.average(np.vstack(row_targets), axis=0) diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py new file mode 100644 index 00000000..f6a79a75 --- /dev/null +++ b/selene_sdk/train_methylation_model.py @@ -0,0 +1,749 @@ +""" +This module provides the `TrainModel` class and supporting methods. +""" +import logging +import math +import os +import shutil +from time import strftime +from time import time + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.optim.lr_scheduler import ReduceLROnPlateau +from sklearn.metrics import average_precision_score +from sklearn.metrics import f1_score +from sklearn.metrics import roc_auc_score + +from .utils import initialize_logger +from .utils import load_model_from_state_dict +from .utils.methylation_performance_metrics import MethylationPerformanceMetrics + +logger = logging.getLogger("selene") + + +def _metrics_logger(name, out_filepath): + logger = logging.getLogger("{0}".format(name)) + logger.setLevel(logging.INFO) + formatter = logging.Formatter("%(message)s") + file_handle = logging.FileHandler( + os.path.join(out_filepath, "{0}.txt".format(name))) + file_handle.setFormatter(formatter) + logger.addHandler(file_handle) + return logger + + +class TrainMethylationModel(object): + """ + This class ties together the various objects and methods needed to + train and validate a model. + + TrainModel saves a checkpoint model (overwriting it after + `save_checkpoint_every_n_steps`) as well as a best-performing model + (overwriting it after `report_stats_every_n_steps` if the latest + validation performance is better than the previous best-performing + model) to `output_dir`. + + TrainModel also outputs 2 files that can be used to monitor training + as Selene runs: `selene_sdk.train_model.train.txt` (training loss) and + `selene_sdk.train_model.validation.txt` (validation loss & average + ROC AUC). The columns in these files can be used to quickly visualize + training history (e.g. you can use `matplotlib`, `plt.plot(auc_list)`) + and see, for example, whether the model is still improving, if there are + signs of overfitting, etc. + + Parameters + ---------- + model : torch.nn.Module + The model to train. + data_sampler : selene_sdk.samplers.Sampler + The example generator. + optimizer_class : torch.optim.Optimizer + The optimizer to minimize loss with. + optimizer_kwargs : dict + The dictionary of keyword arguments to pass to the optimizer's + constructor. + batch_size : int + Specify the batch size to process examples. Should be a power of 2. + max_steps : int + The maximum number of mini-batches to iterate over. + report_stats_every_n_steps : int + The frequency with which to report summary statistics. 
You can
+        set this value to be equivalent to a training epoch, i.e.
+        `n_steps * batch_size` is the total number of samples
+        seen by the model so far. Selene evaluates the model on the validation
+        dataset every `report_stats_every_n_steps` and, if the model obtains
+        the best performance so far (based on the user-specified loss function),
+        Selene saves the model state to a file called `best_model.pth.tar` in
+        `output_dir`.
+    output_dir : str
+        The output directory to save model checkpoints and logs in.
+    save_checkpoint_every_n_steps : int or None, optional
+        Default is 1000. If None, set to the same value as
+        `report_stats_every_n_steps`.
+    save_new_checkpoints_after_n_steps : int or None, optional
+        Default is None. The number of steps after which Selene will
+        continually save new checkpoint model weights files
+        (`checkpoint-<TIMESTAMP>.pth.tar`) every
+        `save_checkpoint_every_n_steps`. Before this point,
+        the file `checkpoint.pth.tar` is overwritten every
+        `save_checkpoint_every_n_steps` to limit the memory requirements.
+    n_validation_samples : int or None, optional
+        Default is `None`. Specify the number of validation samples in the
+        validation set. If `n_validation_samples` is `None` and the data sampler
+        used is the `selene_sdk.samplers.IntervalsSampler` or
+        `selene_sdk.samplers.RandomSampler`, we will retrieve 32000
+        validation samples. If `None` and using
+        `selene_sdk.samplers.MultiSampler`, we will use all
+        available validation samples from the appropriate data file.
+    n_test_samples : int or None, optional
+        Default is `None`. Specify the number of test samples in the test set.
+        If `n_test_samples` is `None` and
+
+        - the sampler you specified has no test partition, you should not
+          specify `evaluate` as one of the operations in the `ops` list.
+          That is, Selene will not automatically evaluate your trained
+          model on a test dataset, because the sampler you are using does
+          not have any test data.
+        - the sampler you use is of type `selene_sdk.samplers.OnlineSampler`
+          (and the test partition exists), we will retrieve 640000 test
+          samples.
+        - the sampler you use is of type
+          `selene_sdk.samplers.MultiSampler` (and the test partition
+          exists), we will use all the test samples available in the
+          appropriate data file.
+
+    cpu_n_threads : int, optional
+        Default is 1. Sets the number of OpenMP threads used for parallelizing
+        CPU operations.
+    use_cuda : bool, optional
+        Default is `False`. Specify whether a CUDA-enabled GPU is available
+        for torch to use during training.
+    data_parallel : bool, optional
+        Default is `False`. Specify whether multiple GPUs are available
+        for torch to use during training.
+    logging_verbosity : {0, 1, 2}, optional
+        Default is 2. Set the logging verbosity level.
+
+        * 0 - Only warnings will be logged.
+        * 1 - Information and warnings will be logged.
+        * 2 - Debug messages, information, and warnings will all be\
+              logged.
+
+    checkpoint_resume : str or None, optional
+        Default is `None`. If `checkpoint_resume` is not None, it should be the
+        path to a model file generated by `torch.save` that can now be read
+        using `torch.load`.
+    use_scheduler : bool, optional
+        Default is `True`. If `True`, a learning rate scheduler is used to
+        reduce the learning rate on plateau. PyTorch's ReduceLROnPlateau
+        scheduler with patience=16 and factor=0.8 is used. Different scheduler
+        parameters can be specified with `scheduler_kwargs`.
+    deterministic : bool, optional
+        Default is `False`. 
If `True`, will set + `torch.backends.cudnn.deterministic` to True and + `torch.backends.cudnn.benchmark = False`. In Selene CLI, + if `random_seed` is set in the configuration YAML, Selene automatically + passes in `deterministic=True` to the TrainModel class. + scheduler_kwargs : dict, optional + Default is patience=16, verbose=True, and factor=0.8. Set the parameters + for the PyTorch ReduceLROnPlateau scheduler. + stopping_criteria : list or None, optional + Default is `None`. If `stopping_criteria` is not None, it should be a + list specifying how to use early stopping. The first value should be + a str corresponding to one of `metrics`. The second value should be an + int indicating the patience. If the specified metric does not improve + in the given patience (usually corresponding to the number of epochs), + training stops early. + + Attributes + ---------- + model : torch.nn.Module + The model to train. + sampler : selene_sdk.samplers.Sampler + The example generator. + criterion : torch.nn._Loss + The loss function to optimize. + optimizer : torch.optim.Optimizer + The optimizer to minimize loss with. + batch_size : int + The size of the mini-batch to use during training. + max_steps : int + The maximum number of mini-batches to iterate over. + nth_step_report_stats : int + The frequency with which to report summary statistics. + nth_step_save_checkpoint : int + The frequency with which to save a model checkpoint. + use_cuda : bool + If `True`, use a CUDA-enabled GPU. If `False`, use the CPU. + data_parallel : bool + Whether to use multiple GPUs or not. + output_dir : str + The directory to save model checkpoints and logs. + + """ + + def __init__(self, + model, + data_sampler, + optimizer_class, + optimizer_kwargs, + batch_size, + max_steps, + report_stats_every_n_steps, + output_dir, + save_checkpoint_every_n_steps=1000, + save_new_checkpoints_after_n_steps=None, + report_gt_feature_n_positives=10, + n_validation_samples=None, + n_test_samples=None, + cpu_n_threads=1, + use_cuda=False, + data_parallel=False, + logging_verbosity=2, + checkpoint_resume=None, + metrics=dict(roc_auc=roc_auc_score, + average_precision=average_precision_score), + use_scheduler=True, + deterministic=False, + scheduler_kwargs=dict(patience=16, + verbose=True, + factor=0.8), + stopping_criteria=None): + """ + Constructs a new `TrainModel` object. 
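+
+        As a sketch of the objective this trainer implements (see `train`
+        below): the total loss is the classification loss on the methylation
+        indicator plus, when the batch contains positive examples, the
+        regression loss on their non-NaN methylation targets, i.e.
+        ``BCE(cls_pred, indicator) + MSE(reg_pred[indicator == 1],
+        targets[indicator == 1])``.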
+ """ + self.model = model + self.sampler = data_sampler + + # these can be made parameters if desired + self.cls_criterion = nn.BCELoss() + self.reg_criterion = F.mse_loss + + self.optimizer = optimizer_class( + self.model.parameters(), **optimizer_kwargs) + + self.batch_size = batch_size + self.max_steps = max_steps + self.nth_step_report_stats = report_stats_every_n_steps + self.nth_step_save_checkpoint = None + + if not save_checkpoint_every_n_steps: + self.nth_step_save_checkpoint = report_stats_every_n_steps + else: + self.nth_step_save_checkpoint = save_checkpoint_every_n_steps + + self._save_new_checkpoints = save_new_checkpoints_after_n_steps + + os.makedirs(output_dir, exist_ok=True) + self.output_dir = output_dir + + initialize_logger( + os.path.join(self.output_dir, "{0}.log".format(__name__)), + verbosity=logging_verbosity) + + if deterministic: + logger.info("Setting deterministic = True for reproducibility.") + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + + logger.info("Training parameters set: batch size {0}, " + "number of steps per 'epoch': {1}, " + "maximum number of steps: {2}".format( + self.batch_size, + self.nth_step_report_stats, + self.max_steps)) + + torch.set_num_threads(cpu_n_threads) + + self.use_cuda = use_cuda + self.data_parallel = data_parallel + + if self.data_parallel: + self.model = nn.DataParallel(model) + logger.debug("Wrapped model in DataParallel") + + if self.use_cuda: + self.model.cuda() + #self.cls_criterion.cuda() + #self.reg_criterion.cuda() + logger.debug("Set modules to use CUDA") + + self._report_gt_feature_n_positives = report_gt_feature_n_positives + self._metrics = dict(cls_average_precision=average_precision_score, + cls_f1=f1_score) + self._n_validation_samples = n_validation_samples + self._n_test_samples = n_test_samples + self._use_scheduler = use_scheduler + + self._init_train(scheduler_kwargs) + self._init_validate() + if "test" in self.sampler.modes: + self._init_test() + if checkpoint_resume is not None: + self._load_checkpoint(checkpoint_resume) + + if type(stopping_criteria) is list and len(stopping_criteria) == 2: + stopping_metric, stopping_patience = stopping_criteria + self._early_stopping = True + if stopping_metric in self._metrics: + self._stopping_metric = stopping_metric + self._stopping_patience = stopping_patience + self._stopping_reached = False + else: + logger.warning("Did not recognize stopping metric. Not performing early stopping.") + self._early_stopping = False + else: + self._early_stopping = False + + def _load_checkpoint(self, checkpoint_resume): + checkpoint = torch.load( + checkpoint_resume, + map_location=lambda storage, location: storage) + if "state_dict" not in checkpoint: + raise ValueError( + ("'state_dict' not found in file {0} " + "loaded with method `torch.load`. 
Selene does not support " + "continued training of models that were not originally " + "trained using Selene.").format(checkpoint_resume)) + + self.model = load_model_from_state_dict( + checkpoint["state_dict"], self.model) + + self._start_step = checkpoint["step"] + if self._start_step >= self.max_steps: + self.max_steps += self._start_step + + self._min_loss = checkpoint["min_loss"] + self.optimizer.load_state_dict( + checkpoint["optimizer"]) + if self.use_cuda: + for state in self.optimizer.state.values(): + for k, v in state.items(): + if isinstance(v, torch.Tensor): + state[k] = v.cuda() + + logger.info( + ("Resuming from checkpoint: step {0}, min loss {1}").format( + self._start_step, self._min_loss)) + + def _init_train(self, scheduler_kwargs): + self._start_step = 0 + self._train_logger = _metrics_logger( + "{0}.train".format(__name__), self.output_dir) + self._train_logger.info("loss") + if self._use_scheduler: + self.scheduler = ReduceLROnPlateau( + self.optimizer, + 'min', + **scheduler_kwargs) + self._time_per_step = [] + self._train_loss = [] + + def _init_validate(self): + self._min_loss = float("inf") # TODO: Should this be set when it is used later? Would need to if we want to train model 2x in one run. + self._create_validation_set(n_samples=self._n_validation_samples) + self._validation_metrics = MethylationPerformanceMetrics( + self.sampler.get_feature_from_index, + report_gt_feature_n_positives=self._report_gt_feature_n_positives, + metrics=self._metrics) + self._validation_logger = _metrics_logger( + "{0}.validation".format(__name__), self.output_dir) + + self._validation_logger.info("\t".join(["loss"] + + sorted([x for x in self._validation_metrics.metrics.keys()]))) + + def _init_test(self): + self._test_data = None + self._n_test_samples = self._n_test_samples + self._test_metrics = MethylationPerformanceMetrics( + self.sampler.get_feature_from_index, + report_gt_feature_n_positives=self._report_gt_feature_n_positives, + metrics=self._metrics) + + def _create_validation_set(self, n_samples=None): + """ + Generates the set of validation examples. + + Parameters + ---------- + n_samples : int or None, optional + Default is `None`. The size of the validation set. If `None`, + will use all validation examples in the sampler. + + """ + logger.info("Creating validation dataset.") + t_i = time() + self._validation_data, self._all_validation_targets, self._all_validation_inds = \ + self.sampler.get_validation_set( + self.batch_size, n_samples=n_samples) + t_f = time() + logger.info(("{0} s to load {1} validation examples ({2} validation " + "batches) to evaluate after each training step.").format( + t_f - t_i, + len(self._validation_data) * self.batch_size, + len(self._validation_data))) + + def create_test_set(self): + """ + Loads the set of test samples. + We do not create the test set in the `TrainModel` object until + this method is called, so that we avoid having to load it into + memory until the model has been trained and is ready to be + evaluated. 
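+
+        An illustrative call sequence (`trainer` is a hypothetical
+        `TrainMethylationModel` instance)::
+
+            >>> trainer.train_and_validate()
+            >>> trainer.evaluate()  # calls create_test_set() lazily if needed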
+ + """ + logger.info("Creating test dataset.") + t_i = time() + self._test_data, self._all_test_targets, self._all_test_inds = \ + self.sampler.get_test_set( + self.batch_size, n_samples=self._n_test_samples) + t_f = time() + logger.info(("{0} s to load {1} test examples ({2} test batches) " + "to evaluate after all training steps.").format( + t_f - t_i, + len(self._test_data) * self.batch_size, + len(self._test_data))) + np.savez_compressed( + os.path.join(self.output_dir, "test_targets.npz"), + data=self._all_test_targets) + + def _get_batch(self): + """ + Fetches a mini-batch of examples + + Returns + ------- + tuple(numpy.ndarray, numpy.ndarray) + A tuple containing the examples and targets. + + """ + t_i_sampling = time() + batch_sequences, batch_targets, batch_inds = self.sampler.sample( + batch_size=self.batch_size) + t_f_sampling = time() + logger.debug( + ("[BATCH] Time to sample {0} examples: {1} s.").format( + self.batch_size, + t_f_sampling - t_i_sampling)) + return (batch_sequences, batch_targets, batch_inds) + + def _checkpoint(self): + checkpoint_dict = { + "step": self.step, + "arch": self.model.__class__.__name__, + "state_dict": self.model.state_dict(), + "min_loss": self._min_loss, + "optimizer": self.optimizer.state_dict() + } + if self._save_new_checkpoints is not None and \ + self._save_new_checkpoints >= self.step: + checkpoint_filename = "checkpoint-{0}".format( + strftime("%m%d%H%M%S")) + self._save_checkpoint( + checkpoint_dict, False, filename=checkpoint_filename) + logger.debug("Saving checkpoint `{0}.pth.tar`".format( + checkpoint_filename)) + else: + self._save_checkpoint( + checkpoint_dict, False) + + def train_and_validate(self): + """ + Trains the model and measures validation performance. + + """ + for step in range(self._start_step, self.max_steps): + self.step = step + self.train() + + if step % self.nth_step_save_checkpoint == 0: + self._checkpoint() + if self.step and self.step % self.nth_step_report_stats == 0: + self.validate() + if self._early_stopping and self._stopping_reached: + logger.debug("Patience ran out. Stopping early.") + break + + self.sampler.save_dataset_to_file("train", close_filehandle=True) + + def train(self): + """ + Trains the model on a batch of data. + + Returns + ------- + float + The training loss. + + """ + t_i = time() + self.model.train() + self.sampler.set_mode("train") + + inputs, targets, inds = self._get_batch() + inputs = torch.Tensor(inputs) + targets = torch.Tensor(targets) + inds = torch.Tensor(inds) + + if self.use_cuda: + inputs = inputs.cuda() + targets = targets.cuda() + inds = inds.cuda() + + cls_pred, reg_pred = self.model(inputs.transpose(1, 2)) + cls_loss = self.cls_criterion(cls_pred, inds) + + reg_loss = 0 + inds = inds.ravel() + if len(inds[inds == 1]) != 0: + subset_pred = reg_pred[inds == 1] + subset_tgt = targets[inds == 1] + mask = torch.isnan(subset_tgt) + reg_loss = self.reg_criterion( + subset_pred[~mask], subset_tgt[~mask]) + # TODO: write a custom loss fn based on this? 
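+            # One possible shape for such a loss (an illustrative sketch, not
+            # part of this patch): zero out the NaN target positions, average
+            # the squared error per example over observed targets only, then
+            # average over examples -- in the same spirit as the commented
+            # `nanmean` line below:
+            #
+            #   def masked_mse(pred, tgt):
+            #       mask = torch.isnan(tgt)
+            #       sq_err = ((pred - tgt).masked_fill(mask, 0.)) ** 2
+            #       n_obs = (~mask).sum(dim=1).clamp(min=1)
+            #       return (sq_err.sum(dim=1) / n_obs).mean()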
+ #reg_loss = self.reg_criterion( + # subset_pred, subset_tgt, reduction='none').nanmean(dim=1).mean() + + self.optimizer.zero_grad() + total_loss = cls_loss + if torch.is_tensor(reg_loss): + total_loss += reg_loss + #print("cls_loss", cls_loss) + #print("reg_loss", reg_loss) + total_loss.backward() + self.optimizer.step() + self._train_loss.append(total_loss.item()) + t_f = time() + + self._time_per_step.append(t_f - t_i) + if self.step and self.step % self.nth_step_report_stats == 0: + logger.info(("[STEP {0}] average number " + "of steps per second: {1:.1f}").format( + self.step, 1. / np.average(self._time_per_step))) + self._train_logger.info(np.average(self._train_loss)) + logger.info("training loss: {0}".format(np.average(self._train_loss))) + self._time_per_step = [] + self._train_loss = [] + + + def _evaluate_on_data(self, data_in_batches): + """ + Makes predictions for some labeled input data. + + Parameters + ---------- + data_in_batches : list(tuple(numpy.ndarray, numpy.ndarray)) + A list of tuples of the data, where the first element is + the example, and the second element is the label. + + Returns + ------- + tuple(float, list(numpy.ndarray)) + Returns the average loss, and the list of all predictions. + + """ + self.model.eval() + + batch_losses = [] + all_predictions = [] + all_ind_predictions = [] + for (seqs, targets, inds) in data_in_batches: + seqs = torch.Tensor(seqs) + targets = torch.Tensor(targets) + inds = torch.Tensor(inds) + if self.use_cuda: + seqs = seqs.cuda() + targets = targets.cuda() + inds = inds.cuda() + + with torch.no_grad(): + cls_pred, reg_pred = self.model(seqs.transpose(1, 2)) + cls_loss = self.cls_criterion(cls_pred, inds.reshape((len(inds), 1))) + + reg_loss = 0 + if len(inds[inds == 1]) != 0: + subset_pred = reg_pred[inds == 1] + subset_tgt = targets[inds == 1] + + reg_loss = self.reg_criterion( + subset_pred, subset_tgt, reduction='none').nanmean(dim=1).mean() + + all_predictions.append( + reg_pred.data.cpu().numpy()) + all_ind_predictions.append( + cls_pred.data.cpu().numpy()) + + total_loss = cls_loss + if torch.is_tensor(reg_loss): + total_loss += reg_loss + batch_losses.append(total_loss.item()) + all_predictions = np.vstack(all_predictions) + all_ind_predictions = np.vstack(all_ind_predictions) + return np.average(batch_losses), all_ind_predictions, all_predictions + + def validate(self): + """ + Measures model validation performance. + + Returns + ------- + dict + A dictionary, where keys are the names of the loss metrics, + and the values are the average value for that metric over + the validation set. 
+
+        """
+        validation_loss, val_pred_inds, val_pred_tgts = self._evaluate_on_data(
+            self._validation_data)
+        cls_valid_scores = self._validation_metrics.update(
+            val_pred_inds, self._all_validation_inds.reshape(len(val_pred_inds), 1),
+            scores=['cls_average_precision', 'cls_f1'])
+
+        #reg_valid_scores = self._validation_metrics.update(
+        #    val_pred_tgts[self._all_validation_inds == 1],
+        #    self._all_validation_targets[self._all_validation_inds == 1],
+        #    scores=['reg_roc_auc', 'reg_average_precision'])
+
+        valid_scores = {}
+        for name, score in cls_valid_scores.items():
+            logger.info("validation {0}: {1}".format(name, score))
+            valid_scores[name] = score
+        #for name, score in reg_valid_scores.items():
+        #    logger.info("validation {0}: {1}".format(name, score))
+        #    valid_scores[name] = score
+
+        valid_scores["loss"] = validation_loss
+
+        to_log = [str(validation_loss)]
+        for k in sorted(self._validation_metrics.metrics.keys()):
+            if k in valid_scores and valid_scores[k]:
+                to_log.append(str(valid_scores[k]))
+            else:
+                to_log.append("NA")
+        self._validation_logger.info("\t".join(to_log))
+
+        # scheduler update
+        if self._use_scheduler:
+            self.scheduler.step(
+                math.ceil(validation_loss * 1000.0) / 1000.0)
+
+        # save best_model
+        if validation_loss < self._min_loss:
+            self._min_loss = validation_loss
+            self._save_checkpoint({
+                "step": self.step,
+                "arch": self.model.__class__.__name__,
+                "state_dict": self.model.state_dict(),
+                "min_loss": self._min_loss,
+                "optimizer": self.optimizer.state_dict()}, True)
+            logger.debug("Updating `best_model.pth.tar`")
+        logger.info("validation loss: {0}".format(validation_loss))
+
+        # check for early stopping
+        if self._early_stopping:
+            stopping_metric = self._validation_metrics.metrics[self._stopping_metric].data
+            index = np.argmax(stopping_metric)
+            if self._stopping_patience - (len(stopping_metric) - index - 1) <= 0:
+                self._stopping_reached = True
+
+    def evaluate(self):
+        """
+        Measures the model test performance.
+
+        Returns
+        -------
+        dict
+            A dictionary, where keys are the names of the loss metrics,
+            and the values are the average value for that metric over
+            the test set.
+
+        """
+        if self._test_data is None:
+            self.create_test_set()
+        average_loss, test_pred_inds, test_pred_tgts = self._evaluate_on_data(
+            self._test_data)
+
+        cls_test_scores = self._test_metrics.update(
+            test_pred_inds, self._all_test_inds.reshape(len(test_pred_inds), 1),
+            scores=['cls_average_precision', 'cls_f1',])
+
+        #reg_test_scores = self._test_metrics.update(
+        #    test_pred_tgts[self._all_test_inds == 1],
+        #    self._all_test_targets[self._all_test_inds == 1],
+        #    scores=['reg_roc_auc', 'reg_average_precision'])
+
+        np.savez_compressed(
+            os.path.join(self.output_dir, "test_predictions.npz"),
+            data=test_pred_tgts)
+        np.savez_compressed(
+            os.path.join(self.output_dir, "test_predictions.indicator.npz"),
+            data=test_pred_inds)
+
+        for name, score in cls_test_scores.items():
+            logger.info("test {0}: {1}".format(name, score))
+        #for name, score in reg_test_scores.items():
+        #    logger.info("test {0}: {1}".format(name, score))
+
+        test_performance = os.path.join(
+            self.output_dir, "test_performance.txt")
+        feature_scores_dict = self._test_metrics.write_feature_scores_to_file(
+            test_performance)
+
+        # collect the average metric scores alongside the average loss
+        average_scores = dict(cls_test_scores)
+        average_scores["loss"] = average_loss
+
+        return (average_scores, feature_scores_dict)
+
+    def _save_checkpoint(self,
+                         state,
+                         is_best,
+                         filename="checkpoint"):
+        """
+        Saves snapshot of the model state to file. 
Will save a checkpoint + with name `.pth.tar` and, if this is the model's best + performance so far, will save the state to a `best_model.pth.tar` + file as well. + + Models are saved in the state dictionary format. This is a more + stable format compared to saving the whole model (which is another + option supported by PyTorch). Note that we do save a number of + additional, Selene-specific parameters in the dictionary + and that the actual `model.state_dict()` is stored in the `state_dict` + key of the dictionary loaded by `torch.load`. + + See: https://pytorch.org/docs/stable/notes/serialization.html for more + information about how models are saved in PyTorch. + + Parameters + ---------- + state : dict + Information about the state of the model. Note that this is + not `model.state_dict()`, but rather, a dictionary containing + keys that can be used for continued training in Selene + _in addition_ to a key `state_dict` that contains + `model.state_dict()`. + is_best : bool + Is this the model's best performance so far? + filename : str, optional + Default is "checkpoint". Specify the checkpoint filename. Will + append a file extension to the end of the `filename` + (e.g. `checkpoint.pth.tar`). + + Returns + ------- + None + + """ + logger.debug("[TRAIN] {0}: Saving model state to file.".format( + state["step"])) + cp_filepath = os.path.join( + self.output_dir, filename) + torch.save(state, "{0}.pth.tar".format(cp_filepath)) + if is_best: + best_filepath = os.path.join(self.output_dir, "best_model") + shutil.copyfile("{0}.pth.tar".format(cp_filepath), + "{0}.pth.tar".format(best_filepath)) diff --git a/selene_sdk/utils/methylation_performance_metrics.py b/selene_sdk/utils/methylation_performance_metrics.py new file mode 100644 index 00000000..fec5f5e3 --- /dev/null +++ b/selene_sdk/utils/methylation_performance_metrics.py @@ -0,0 +1,315 @@ +""" +This module provides the `PerformanceMetrics` class and supporting +functionality for tracking and computing model performance. +""" +from collections import defaultdict, namedtuple +import logging +import os + +import numpy as np +from sklearn.metrics import average_precision_score +from sklearn.metrics import precision_recall_curve +from sklearn.metrics import roc_auc_score +from sklearn.metrics import roc_curve +from scipy.stats import rankdata + + +logger = logging.getLogger("selene") + + +Metric = namedtuple("Metric", ["fn", "data"]) +""" +A tuple containing a metric function and the results from applying that +metric to some values. + +Parameters +---------- +fn : types.FunctionType + A metric. +data : list(float) + A list holding the results from applying the metric. + +Attributes +---------- +fn : types.FunctionType + A metric. +data : list(float) + A list holding the results from applying the metric. + +""" + + +def compute_score(prediction, target, metric_fn, + report_gt_feature_n_positives=10): + """ + Using a user-specified metric, computes the distance between + two tensors. + + Parameters + ---------- + prediction : numpy.ndarray + Value predicted by user model. + target : numpy.ndarray + True value that the user model was trying to predict. + metric_fn : types.FunctionType + A metric that can measure the distance between the prediction + and target variables. + report_gt_feature_n_positives : int, optional + Default is 10. The minimum number of positive examples for a + feature in order to compute the score for it. 
+ + Returns + ------- + average_score, feature_scores : tuple(float, numpy.ndarray) + A tuple containing the average of all feature scores, and a + vector containing the scores for each feature. If there were + no features meeting our filtering thresholds, will return + `(None, [])`. + """ + feature_scores = np.ones(target.shape[1]) * np.nan + # Deal with the case of multi-class classification, where each example only has one target value but multiple prediction values + if target.shape[1] == 1 and prediction.shape[1] > 1: + prediction = [prediction] + else: + prediction = prediction.T + for index, feature_preds in enumerate(prediction): + feature_targets = target[:, index] + feature_targets = feature_targets[~np.isnan(feature_targets)] + feature_preds = feature_preds[~np.isnan(feature_preds)] + if len(np.unique(feature_targets)) > 0 and \ + np.count_nonzero(feature_targets) > report_gt_feature_n_positives: + try: + feature_scores[index] = metric_fn( + feature_targets, feature_preds) + except ValueError: # do I need to make this more generic? + continue + valid_feature_scores = [s for s in feature_scores if not np.isnan(s)] # Allow 0 or negative values. + if not valid_feature_scores: + return None, feature_scores + average_score = np.average(valid_feature_scores) + return average_score, feature_scores + + +def get_feature_specific_scores(data, get_feature_from_index_fn): + """ + Generates a dictionary mapping feature names to feature scores from + an intermediate representation. + + Parameters + ---------- + data : list(tuple(int, float)) + A list of tuples, where each tuple contains a feature's index + and the score for that feature. + get_feature_from_index_fn : types.FunctionType + A function that takes an index (`int`) and returns a feature + name (`str`). + + Returns + ------- + dict + A dictionary mapping feature names (`str`) to scores (`float`). + If there was no score for a feature, its score will be set to + `None`. + + """ + feature_score_dict = {} + for index, score in enumerate(data): + feature = get_feature_from_index_fn(index) + if not np.isnan(score): + feature_score_dict[feature] = score + else: + feature_score_dict[feature] = None + return feature_score_dict + + +def auc_u_test(labels, predictions): + """ + Outputs the area under the the ROC curve associated with a certain + set of labels and the predictions given by the training model. + Computed from the U statistic. + + Parameters + ---------- + labels: numpy.ndarray + Known labels of values predicted by model. Must be one dimensional. + predictions: numpy.ndarray + Value predicted by user model. Must be one dimensional, with matching + dimension to `labels` + + Returns + ------- + float + AUC value of given label, prediction pairs + + """ + len_pos = int(np.sum(labels)) + len_neg = len(labels) - len_pos + rank_sum = np.sum(rankdata(predictions)[labels == 1]) + u_value = rank_sum - (len_pos * (len_pos + 1)) / 2 + auc = u_value / (len_pos * len_neg) + return auc + + +class MethylationPerformanceMetrics(object): + """ + Tracks and calculates metrics to evaluate how closely a model's + predictions match the true values it was designed to predict. + + Parameters + ---------- + get_feature_from_index_fn : types.FunctionType + A function that takes an index (`int`) and returns a feature + name (`str`). + report_gt_feature_n_positives : int, optional + Default is 10. The minimum number of positive examples for a + feature in order to compute the score for it. 
+ metrics : dict + A dictionary that maps metric names (`str`) to metric functions. + By default, this contains `"roc_auc"`, which maps to + `sklearn.metrics.roc_auc_score`, and `"average_precision"`, + which maps to `sklearn.metrics.average_precision_score`. + + + + Attributes + ---------- + skip_threshold : int + The minimum number of positive examples of a feature that must + be included in an update for a metric score to be + calculated for it. + get_feature_from_index : types.FunctionType + A function that takes an index (`int`) and returns a feature + name (`str`). + metrics : dict + A dictionary that maps metric names (`str`) to metric objects + (`Metric`). By default, this contains `"roc_auc"` and + `"average_precision"`. + + """ + + def __init__(self, + get_feature_from_index_fn, + report_gt_feature_n_positives=10, + metrics=dict(roc_auc=roc_auc_score, + average_precision=average_precision_score)): + """ + Creates a new object of the `PerformanceMetrics` class. + """ + self.skip_threshold = report_gt_feature_n_positives + self.get_feature_from_index = get_feature_from_index_fn + self.metrics = dict() + for k, v in metrics.items(): + self.metrics[k] = Metric(fn=v, data=[]) + + def add_metric(self, name, metric_fn): + """ + Begins tracking of the specified metric. + + Parameters + ---------- + name : str + The name of the metric. + metric_fn : types.FunctionType + A metric function. + + """ + self.metrics[name] = Metric(fn=metric_fn, data=[]) + + def remove_metric(self, name): + """ + Ends the tracking of the specified metric, and returns the + previous scores associated with that metric. + + Parameters + ---------- + name : str + The name of the metric. + + Returns + ------- + list(float) + The list of feature-specific scores obtained by previous + uses of the specified metric. + + """ + data = self.metrics[name].data + del self.metrics[name] + return data + + def update(self, prediction, target, scores=None): + """ + Evaluates the tracked metrics on a model prediction and its + target value, and adds this to the metric histories. + + Parameters + ---------- + prediction : numpy.ndarray + Value predicted by user model. + target : numpy.ndarray + True value that the user model was trying to predict. + + Returns + ------- + dict + A dictionary mapping each metric names (`str`) to the + average score of that metric across all features + (`float`). + + """ + if scores is None: + scores = list(self.metrics.keys()) + + metric_scores = {} + for name in scores: + metric = self.metrics[name] + avg_score, feature_scores = compute_score( + prediction, target, metric.fn, + report_gt_feature_n_positives=self.skip_threshold) + metric.data.append(feature_scores) + metric_scores[name] = avg_score + return metric_scores + + def write_feature_scores_to_file(self, output_path): + """ + Writes each metric's score for each feature to a specified + file. + + Parameters + ---------- + output_path : str + The path to the output file where performance metrics will + be written. + + Returns + ------- + dict + A dictionary mapping feature names (`str`) to + sub-dictionaries (`dict`). Each sub-dictionary then maps + metric names (`str`) to the score for that metric on the + given feature. If a metric was not evaluated on a given + feature, the score will be `None`. 
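+
+        Examples
+        --------
+        An illustrative sketch of the tab-separated output file (feature
+        and metric names depend on your configuration)::
+
+            class       cls_average_precision
+            featureA    0.8421
+            featureB    NA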
+ + """ + feature_scores = defaultdict(dict) + for name, metric in self.metrics.items(): + feature_score_dict = get_feature_specific_scores( + metric.data[-1], self.get_feature_from_index) + for feature, score in feature_score_dict.items(): + if score is None: + feature_scores[feature] = None + else: + feature_scores[feature][name] = score + + metric_cols = [m for m in self.metrics.keys()] + cols = '\t'.join(["class"] + metric_cols) + with open(output_path, 'w+') as file_handle: + file_handle.write("{0}\n".format(cols)) + for feature, metric_scores in feature_scores.items(): + if not metric_scores: + file_handle.write("{0}\t{1}\n".format(feature, "\t".join(["NA"] * len(metric_cols)))) + else: + metric_score_cols = '\t'.join( + ["{0:.4f}".format(s) for s in metric_scores.values()]) + file_handle.write("{0}\t{1}\n".format(feature, + metric_score_cols)) + return feature_scores diff --git a/selene_sdk/utils/non_strand_specific_module.py b/selene_sdk/utils/non_strand_specific_module.py index 367a46a9..b69e6e00 100644 --- a/selene_sdk/utils/non_strand_specific_module.py +++ b/selene_sdk/utils/non_strand_specific_module.py @@ -67,11 +67,12 @@ def forward(self, input): else: reverse_input = _flip(_flip(input, 1), 2) - output = self.model.forward(input) - output_from_rev = self.model.forward(reverse_input) + output1, output2 = self.model.forward(input) + output_from_rev1, output_from_rev2 = self.model.forward(reverse_input) if self.mode == "mean": - return (output + output_from_rev) / 2 + return ((output1 + output_from_rev1) / 2, (output2 + output_from_rev2) / 2) else: - return torch.max(output, output_from_rev) + return torch.max(output1, output_from_rev1), torch.max(output2, output_from_rev2) + From 81e8e454c50eedc395339d8abf44496513dd0b00 Mon Sep 17 00:00:00 2001 From: Kathy Date: Thu, 15 Dec 2022 10:09:08 -0500 Subject: [PATCH 02/62] remove f1 metric --- selene_sdk/train_methylation_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py index f6a79a75..358bcd74 100644 --- a/selene_sdk/train_methylation_model.py +++ b/selene_sdk/train_methylation_model.py @@ -14,7 +14,6 @@ import torch.nn.functional as F from torch.optim.lr_scheduler import ReduceLROnPlateau from sklearn.metrics import average_precision_score -from sklearn.metrics import f1_score from sklearn.metrics import roc_auc_score from .utils import initialize_logger @@ -273,8 +272,7 @@ def __init__(self, logger.debug("Set modules to use CUDA") self._report_gt_feature_n_positives = report_gt_feature_n_positives - self._metrics = dict(cls_average_precision=average_precision_score, - cls_f1=f1_score) + self._metrics = dict(cls_average_precision=average_precision_score) self._n_validation_samples = n_validation_samples self._n_test_samples = n_test_samples self._use_scheduler = use_scheduler From 1b14ecc245e77973c6b2383cfefdf59435e1a187 Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 16 Dec 2022 10:30:03 -0500 Subject: [PATCH 03/62] bugfix and some temporary profiling --- selene_sdk/train_methylation_model.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py index 358bcd74..1740206f 100644 --- a/selene_sdk/train_methylation_model.py +++ b/selene_sdk/train_methylation_model.py @@ -492,7 +492,14 @@ def train(self): targets = targets.cuda() inds = inds.cuda() + torch.cuda.synchronize() + mtime1 = time() cls_pred, reg_pred = 
self.model(inputs.transpose(1, 2)) + torch.cuda.synchronize() + mtime2 = time() + print("model time: {0}".format(mtime2 - mtime1)) + + ltime1 = time() cls_loss = self.cls_criterion(cls_pred, inds) reg_loss = 0 @@ -506,6 +513,8 @@ def train(self): # TODO: write a custom loss fn based on this? #reg_loss = self.reg_criterion( # subset_pred, subset_tgt, reduction='none').nanmean(dim=1).mean() + ltime2 = time() + print("loss computation time: {0}".format(ltime2 - ltime1)) self.optimizer.zero_grad() total_loss = cls_loss @@ -515,6 +524,9 @@ def train(self): #print("reg_loss", reg_loss) total_loss.backward() self.optimizer.step() + ltime3 = time() + print("backprop time: {0}".format(ltime3 - ltime2)) + self._train_loss.append(total_loss.item()) t_f = time() @@ -600,7 +612,7 @@ def validate(self): self._validation_data) cls_valid_scores = self._validation_metrics.update( val_pred_inds, self._all_validation_inds.reshape(len(val_pred_inds), 1), - scores=['cls_average_precision', 'cls_f1']) + scores=['cls_average_precision']) #reg_valid_scores = self._validation_metrics.update( # val_pred_tgts[self._all_validation_inds == 1], @@ -668,7 +680,7 @@ def evaluate(self): cls_test_scores = self._test_metrics.update( test_pred_inds, self._all_test_inds.reshape(len(val_pred_inds), 1), - scores=['cls_average_precision', 'cls_f1',]) + scores=['cls_average_precision']) #reg_test_scores = self._test_metrics.update( # test_pred_tgts[self._all_test_inds == 1], From 3be461800ad3fc90d508fbfa7fc56a6c256712be Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 19 Dec 2022 09:57:25 -0500 Subject: [PATCH 04/62] adjustments to loss breakdown and logging --- selene_sdk/train_methylation_model.py | 120 ++++++++++-------- .../utils/non_strand_specific_module.py | 14 +- 2 files changed, 79 insertions(+), 55 deletions(-) diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py index 1740206f..f58547f6 100644 --- a/selene_sdk/train_methylation_model.py +++ b/selene_sdk/train_methylation_model.py @@ -10,11 +10,13 @@ import numpy as np import torch +import torch.autograd.profiler as profiler import torch.nn as nn import torch.nn.functional as F from torch.optim.lr_scheduler import ReduceLROnPlateau from sklearn.metrics import average_precision_score from sklearn.metrics import roc_auc_score +from sklearn.metrics import mean_squared_error from .utils import initialize_logger from .utils import load_model_from_state_dict @@ -272,7 +274,8 @@ def __init__(self, logger.debug("Set modules to use CUDA") self._report_gt_feature_n_positives = report_gt_feature_n_positives - self._metrics = dict(cls_average_precision=average_precision_score) + self._metrics = dict(cls_average_precision=average_precision_score,) + #reg_mse=mean_squared_error) self._n_validation_samples = n_validation_samples self._n_test_samples = n_test_samples self._use_scheduler = use_scheduler @@ -351,7 +354,7 @@ def _init_validate(self): self._validation_logger = _metrics_logger( "{0}.validation".format(__name__), self.output_dir) - self._validation_logger.info("\t".join(["loss"] + + self._validation_logger.info("\t".join(["loss", "mse", "bce"] + sorted([x for x in self._validation_metrics.metrics.keys()]))) def _init_test(self): @@ -492,15 +495,22 @@ def train(self): targets = targets.cuda() inds = inds.cuda() + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + #with profiler.profile(with_stack=True, profile_memory=True) as prof: + #cls_pred, reg_pred = self.model(inputs.transpose(1, 2)) + 
start.record() + pred = self.model(inputs.transpose(1, 2)) + end.record() torch.cuda.synchronize() - mtime1 = time() - cls_pred, reg_pred = self.model(inputs.transpose(1, 2)) - torch.cuda.synchronize() - mtime2 = time() - print("model time: {0}".format(mtime2 - mtime1)) + print("model time: {0}".format(start.elapsed_time(end))) + #print("model time: {0}".format(mtime2 - mtime1)) - ltime1 = time() - cls_loss = self.cls_criterion(cls_pred, inds) + #print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=5)) + cls_pred = pred[:, 0] + reg_pred = pred[:, 1:] + cls_loss = self.cls_criterion(cls_pred, inds.squeeze()) reg_loss = 0 inds = inds.ravel() @@ -513,19 +523,13 @@ def train(self): # TODO: write a custom loss fn based on this? #reg_loss = self.reg_criterion( # subset_pred, subset_tgt, reduction='none').nanmean(dim=1).mean() - ltime2 = time() - print("loss computation time: {0}".format(ltime2 - ltime1)) self.optimizer.zero_grad() total_loss = cls_loss if torch.is_tensor(reg_loss): total_loss += reg_loss - #print("cls_loss", cls_loss) - #print("reg_loss", reg_loss) total_loss.backward() self.optimizer.step() - ltime3 = time() - print("backprop time: {0}".format(ltime3 - ltime2)) self._train_loss.append(total_loss.item()) t_f = time() @@ -559,7 +563,7 @@ def _evaluate_on_data(self, data_in_batches): """ self.model.eval() - batch_losses = [] + batch_bce, batch_mse, batch_losses = [], [], [] all_predictions = [] all_ind_predictions = [] for (seqs, targets, inds) in data_in_batches: @@ -572,29 +576,48 @@ def _evaluate_on_data(self, data_in_batches): inds = inds.cuda() with torch.no_grad(): - cls_pred, reg_pred = self.model(seqs.transpose(1, 2)) - cls_loss = self.cls_criterion(cls_pred, inds.reshape((len(inds), 1))) + #cls_pred, reg_pred = self.model(seqs.transpose(1, 2)) + pred = self.model(seqs.transpose(1, 2)) + cls_pred = pred[:, 0] + reg_pred = pred[:, 1:] + cls_loss = self.cls_criterion(cls_pred, inds) reg_loss = 0 if len(inds[inds == 1]) != 0: subset_pred = reg_pred[inds == 1] subset_tgt = targets[inds == 1] - + mask = torch.isnan(subset_tgt) reg_loss = self.reg_criterion( - subset_pred, subset_tgt, reduction='none').nanmean(dim=1).mean() - - all_predictions.append( - reg_pred.data.cpu().numpy()) - all_ind_predictions.append( - cls_pred.data.cpu().numpy()) - + subset_pred[~mask], subset_tgt[~mask]) + #subset_pred = reg_pred[inds == 1] + #subset_tgt = targets[inds == 1] + #reg_loss = self.reg_criterion( + # subset_pred, subset_tgt, reduction='none').nanmean(dim=1).mean() + + all_predictions.append(reg_pred) + all_ind_predictions.append(cls_pred) + #all_predictions.append( + # reg_pred.data.cpu().numpy()) + #all_ind_predictions.append( + # cls_pred.data.cpu().numpy()) + + batch_bce.append(cls_loss.item()) total_loss = cls_loss if torch.is_tensor(reg_loss): total_loss += reg_loss + batch_mse.append(reg_loss.item()) batch_losses.append(total_loss.item()) - all_predictions = np.vstack(all_predictions) - all_ind_predictions = np.vstack(all_ind_predictions) - return np.average(batch_losses), all_ind_predictions, all_predictions + all_predictions = torch.vstack(all_predictions) + all_ind_predictions = torch.hstack(all_ind_predictions)[:, None] + all_predictions = all_predictions.data.cpu().numpy() + all_ind_predictions = all_ind_predictions.data.cpu().numpy() + + loss_breakdown = { + 'bce': np.average(batch_bce), + 'mse': np.average(batch_mse), + 'loss': np.average(batch_losses) + } + return loss_breakdown, all_ind_predictions, all_predictions def validate(self): 
""" @@ -608,35 +631,30 @@ def validate(self): the validation set. """ - validation_loss, val_pred_inds, val_pred_tgts = self._evaluate_on_data( + lossdict, val_pred_inds, val_pred_tgts = self._evaluate_on_data( self._validation_data) cls_valid_scores = self._validation_metrics.update( val_pred_inds, self._all_validation_inds.reshape(len(val_pred_inds), 1), scores=['cls_average_precision']) - #reg_valid_scores = self._validation_metrics.update( # val_pred_tgts[self._all_validation_inds == 1], # self._all_validation_targets[self._all_validation_inds == 1], - # scores=['reg_roc_auc', 'reg_average_precision']) - - valid_scores = {} + # scores=['reg_mse']) for name, score in cls_valid_scores.items(): logger.info("validation {0}: {1}".format(name, score)) - valid_scores[name] = score + lossdict[name] = score #for name, score in reg_valid_scores.items(): # logger.info("validation {0}: {1}".format(name, score)) - # valid_scores[name] = score - - valid_scores["loss"] = validation_loss + # lossdict[name] = score - to_log = [str(validation_loss)] + to_log = [lossdict['loss'], lossdict['mse'], lossdict['bce']] for k in sorted(self._validation_metrics.metrics.keys()): - if k in valid_scores and valid_scores[k]: - to_log.append(str(valid_scores[k])) + if k in lossdict and lossdict[k]: + to_log.append(lossdict[k]) else: to_log.append("NA") - self._validation_logger.info("\t".join(to_log)) - + self._validation_logger.info('\t'.join([str(s) for s in to_log])) + validation_loss = lossdict['loss'] # scheduler update if self._use_scheduler: self.scheduler.step( @@ -675,17 +693,17 @@ def evaluate(self): """ if self._test_data is None: self.create_test_set() - average_loss, test_pred_inds, test_pred_tgts = self._evaluate_on_data( + losdict, test_pred_inds, test_pred_tgts = self._evaluate_on_data( self._test_data) - cls_test_scores = self._test_metrics.update( - test_pred_inds, self._all_test_inds.reshape(len(val_pred_inds), 1), + test_scores = self._test_metrics.update( + test_pred_inds, self._all_test_inds.reshape(len(test_pred_inds), 1), scores=['cls_average_precision']) #reg_test_scores = self._test_metrics.update( # test_pred_tgts[self._all_test_inds == 1], # self._all_test_targets[self._all_test_inds == 1], - # scores=['reg_roc_auc', 'reg_average_precision']) + # scores=['reg_mse']) np.savez_compressed( os.path.join(self.output_dir, "test_predictions.npz"), @@ -694,19 +712,19 @@ def evaluate(self): os.path.join(self.output_dir, "test_predictions.indicator.npz"), data=test_pred_inds) - for name, score in cls_test_scores.items(): + for name, score in test_scores.items(): logger.info("test {0}: {1}".format(name, score)) + lossdict[name] = score #for name, score in reg_test_scores.items(): # logger.info("test {0}: {1}".format(name, score)) + # lossdict[name] = score test_performance = os.path.join( self.output_dir, "test_performance.txt") feature_scores_dict = self._test_metrics.write_feature_scores_to_file( test_performance) - average_scores["loss"] = average_loss - - return (average_scores, feature_scores_dict) + return (lossdict, feature_scores_dict) def _save_checkpoint(self, state, diff --git a/selene_sdk/utils/non_strand_specific_module.py b/selene_sdk/utils/non_strand_specific_module.py index b69e6e00..7f4b5163 100644 --- a/selene_sdk/utils/non_strand_specific_module.py +++ b/selene_sdk/utils/non_strand_specific_module.py @@ -67,12 +67,18 @@ def forward(self, input): else: reverse_input = _flip(_flip(input, 1), 2) - output1, output2 = self.model.forward(input) - output_from_rev1, output_from_rev2 = 
self.model.forward(reverse_input) + #output1, output2 = self.model.forward(input) + #output_from_rev1, output_from_rev2 = self.model.forward(reverse_input) + output = self.model.forward(input) + output_from_rev = self.model.forward(reverse_input) if self.mode == "mean": - return ((output1 + output_from_rev1) / 2, (output2 + output_from_rev2) / 2) + return (output + output_from_rev) / 2 else: - return torch.max(output1, output_from_rev1), torch.max(output2, output_from_rev2) + return torch.max(output, output_from_rev) + #if self.mode == "mean": + # return ((output1 + output_from_rev1) / 2, (output2 + output_from_rev2) / 2) + #else: + # return torch.max(output1, output_from_rev1), torch.max(output2, output_from_rev2) From 20eea13ac35d76b81ee6ea06712cac20c8e2b194 Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 19 Dec 2022 10:35:04 -0500 Subject: [PATCH 05/62] adjust return type for get_features_data --- .../samplers/methylation_data_sampler.py | 6 +++--- selene_sdk/targets/genomic_features_h5.py | 6 +++--- selene_sdk/train_methylation_model.py | 18 ++++++------------ 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/selene_sdk/samplers/methylation_data_sampler.py b/selene_sdk/samplers/methylation_data_sampler.py index b2c68719..2211f8a1 100644 --- a/selene_sdk/samplers/methylation_data_sampler.py +++ b/selene_sdk/samplers/methylation_data_sampler.py @@ -244,9 +244,9 @@ def _partition_genome_by_chromosome(self): def _retrieve(self, chrom, position): bin_start = position - self._start_radius bin_end = position + self._end_radius - indicator = self.target.is_positive( - chrom, bin_start, bin_end) - retrieved_targets = self.target.get_feature_data( + #indicator = self.target.is_positive( + # chrom, bin_start, bin_end) + indicator, retrieved_targets = self.target.get_feature_data( chrom, bin_start, bin_end) window_start = position - self._start_window_radius window_end = position + self._end_window_radius diff --git a/selene_sdk/targets/genomic_features_h5.py b/selene_sdk/targets/genomic_features_h5.py index cd3bd796..73a9d37e 100644 --- a/selene_sdk/targets/genomic_features_h5.py +++ b/selene_sdk/targets/genomic_features_h5.py @@ -246,11 +246,11 @@ def get_feature_data(self, chrom, start, end): nans = np.zeros(self.n_targets) * np.nan rows = self._query_tabix(chrom, start, end) if not rows: - return nans + return False, nans row_targets = [] for r in rows: target = int(r[3]) row_targets.append(self.data[target]) if len(row_targets) == 0: - return nans - return np.average(np.vstack(row_targets), axis=0) + return True, nans + return True, np.average(np.vstack(row_targets), axis=0) diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py index f58547f6..60deab51 100644 --- a/selene_sdk/train_methylation_model.py +++ b/selene_sdk/train_methylation_model.py @@ -498,16 +498,13 @@ def train(self): start = torch.cuda.Event(enable_timing=True) end = torch.cuda.Event(enable_timing=True) - #with profiler.profile(with_stack=True, profile_memory=True) as prof: #cls_pred, reg_pred = self.model(inputs.transpose(1, 2)) - start.record() + #start.record() pred = self.model(inputs.transpose(1, 2)) - end.record() - torch.cuda.synchronize() - print("model time: {0}".format(start.elapsed_time(end))) - #print("model time: {0}".format(mtime2 - mtime1)) + #end.record() + #torch.cuda.synchronize() + #print("model time: {0}".format(start.elapsed_time(end))) - #print(prof.key_averages(group_by_stack_n=5).table(sort_by='self_cpu_time_total', row_limit=5)) cls_pred = pred[:, 0] 
reg_pred = pred[:, 1:] cls_loss = self.cls_criterion(cls_pred, inds.squeeze()) @@ -596,10 +593,6 @@ def _evaluate_on_data(self, data_in_batches): all_predictions.append(reg_pred) all_ind_predictions.append(cls_pred) - #all_predictions.append( - # reg_pred.data.cpu().numpy()) - #all_ind_predictions.append( - # cls_pred.data.cpu().numpy()) batch_bce.append(cls_loss.item()) total_loss = cls_loss @@ -670,7 +663,8 @@ def validate(self): "min_loss": self._min_loss, "optimizer": self.optimizer.state_dict()}, True) logger.debug("Updating `best_model.pth.tar`") - logger.info("validation loss: {0}".format(validation_loss)) + logger.info("validation loss: {0} (bce: {1}, mse: {2})".format( + validation_loss, lossdict['bce'], lossdict['mse'])) # check for early stopping if self._early_stopping: From d34bf84230a6ebe3e933dbb1efa688be9d833cae Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 19 Dec 2022 11:47:41 -0500 Subject: [PATCH 06/62] bugfix for get_feature_data --- selene_sdk/targets/genomic_features_h5.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/selene_sdk/targets/genomic_features_h5.py b/selene_sdk/targets/genomic_features_h5.py index 73a9d37e..9f791010 100644 --- a/selene_sdk/targets/genomic_features_h5.py +++ b/selene_sdk/targets/genomic_features_h5.py @@ -245,12 +245,12 @@ def get_feature_data(self, chrom, start, end): """ nans = np.zeros(self.n_targets) * np.nan rows = self._query_tabix(chrom, start, end) - if not rows: + if rows is None: return False, nans row_targets = [] for r in rows: target = int(r[3]) row_targets.append(self.data[target]) - if len(row_targets) == 0: - return True, nans + if len(row_targets) == 0: # query was empty + return False, nans return True, np.average(np.vstack(row_targets), axis=0) From 3b6f4ad1c4a0d2ad2e5b4070655c3062c283ba80 Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 20 Dec 2022 09:19:46 -0500 Subject: [PATCH 07/62] remove event object init --- selene_sdk/train_methylation_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py index 60deab51..097affef 100644 --- a/selene_sdk/train_methylation_model.py +++ b/selene_sdk/train_methylation_model.py @@ -495,11 +495,11 @@ def train(self): targets = targets.cuda() inds = inds.cuda() - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) + #start = torch.cuda.Event(enable_timing=True) + #end = torch.cuda.Event(enable_timing=True) - #cls_pred, reg_pred = self.model(inputs.transpose(1, 2)) #start.record() + #cls_pred, reg_pred = self.model(inputs.transpose(1, 2)) pred = self.model(inputs.transpose(1, 2)) #end.record() #torch.cuda.synchronize() From c5bad67cd9b298ead5a45535a9e6d98163ce54c5 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 11 Jan 2023 10:36:47 -0500 Subject: [PATCH 08/62] initial changes to h5 dataloader --- selene_sdk/samplers/dataloader.py | 54 ++++++++++++++++++------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index d6b07656..07f29e38 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -180,13 +180,15 @@ def dfunc(self, *args, **kwargs): self.db = h5py.File(self.file_path, 'r') if self.unpackbits: self.s_len = self.db['{0}_length'.format(self._sequence_key)][()] - self.t_len = self.db['{0}_length'.format(self._targets_key)][()] + #self.t_len = self.db['{0}_length'.format(self._targets_key)][()] if 
self.in_memory: self.sequences = np.asarray(self.db[self._sequence_key]) self.targets = np.asarray(self.db[self._targets_key]) + self.indicators = np.asarray(self.db['indicators']) else: self.sequences = self.db[self._sequence_key] self.targets = self.db[self._targets_key] + self.indicators = self.db['indicators'] self._initialized = True return func(self, *args, **kwargs) return dfunc @@ -202,18 +204,19 @@ def __getitem__(self, index): nulls = np.sum(sequence, axis=-1) == sequence.shape[-1] sequence = sequence.astype(float) sequence[nulls, :] = 1.0 / sequence.shape[-1] - targets = np.unpackbits( - targets, axis=-1).astype(float) + #targets = np.unpackbits( + # targets, axis=-1).astype(float) if sequence.ndim == 3: sequence = sequence[:, :self.s_len, :] else: sequence = sequence[:self.s_len, :] - if targets.ndim == 2: - targets = targets[:, :self.t_len] - else: - targets = targets[:self.t_len] + #if targets.ndim == 2: + # targets = targets[:, :self.t_len] + #else: + # targets = targets[:self.t_len] return (torch.from_numpy(sequence.astype(np.float32)), - torch.from_numpy(targets.astype(np.float32))) + torch.from_numpy(targets.astype(np.float32)), + self.indicators[index]) @init def __len__(self): @@ -288,20 +291,31 @@ class H5DataLoader(DataLoader): """ def __init__(self, - filepath, - in_memory=False, + dataset, num_workers=1, use_subset=None, batch_size=1, - shuffle=True, - unpackbits=False, - sequence_key="sequences", - targets_key="targets"): + seed=436, + sampler=None, + batch_sampler=None, + shuffle=True): + def worker_init_fn(worker_id): + np.random.seed(seed + worker_id) + args = { "batch_size": batch_size, - "num_workers": 0 if in_memory else num_workers, - "pin_memory": True + #"num_workers": 0 if in_memory else num_workers, + "pin_memory": True, + "worker_init_fn": worker_init_fn, + "sampler": sampler, + "batch_sampler": batch_sampler } + + if hasattr(dataset, 'in_memory'): + args['num_workers'] = 0 if dataset.in_memory else num_workers + else: + args['num_workers'] = num_workers + if use_subset is not None: from torch.utils.data.sampler import SubsetRandomSampler if isinstance(use_subset, int): @@ -311,10 +325,4 @@ def __init__(self, args["sampler"] = SubsetRandomSampler(use_subset) else: args["shuffle"] = shuffle - super(H5DataLoader, self).__init__( - _H5Dataset(filepath, - in_memory=in_memory, - unpackbits=unpackbits, - sequence_key=sequence_key, - targets_key=targets_key), - **args) + super(H5DataLoader, self).__init__(dataset, **args) From 5d5438a4e6d3a3ff04c245896d96f29b77fcb193 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 11 Jan 2023 10:38:26 -0500 Subject: [PATCH 09/62] methylation sampler excl --- selene_sdk/samplers/methylation_data_sampler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/selene_sdk/samplers/methylation_data_sampler.py b/selene_sdk/samplers/methylation_data_sampler.py index 2211f8a1..1883a5c2 100644 --- a/selene_sdk/samplers/methylation_data_sampler.py +++ b/selene_sdk/samplers/methylation_data_sampler.py @@ -171,7 +171,7 @@ def _partition_genome_by_proportion(self): for chrom, len_chrom in self.reference_sequence.get_chr_lens(): if '_' in chrom: continue - if 'X' in chrom or 'Y' in chrom: + if 'X' in chrom or 'Y' in chrom or 'M' in chrom: continue self.sample_from_intervals.append( (chrom, @@ -214,7 +214,7 @@ def _partition_genome_by_chromosome(self): for (chrom, len_chrom) in self.reference_sequence.get_chr_lens(): if '_' in chrom: continue - if 'X' in chrom or 'Y' in chrom: + if 'X' in chrom or 'Y' in chrom or 'M' in 
chrom: continue if chrom in self.validation_holdout: self._sample_from_mode["validate"].indices.append( From 831b57a387f091b46fdb6eddf53cf4b95b16682d Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 13 Jan 2023 16:56:52 -0500 Subject: [PATCH 10/62] add type specification to unpackbits --- selene_sdk/samplers/dataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index 07f29e38..b9a1edec 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -200,7 +200,7 @@ def __getitem__(self, index): sequence = self.sequences[index, :, :] targets = self.targets[index, :] if self.unpackbits: - sequence = np.unpackbits(sequence, axis=-2) + sequence = np.unpackbits(sequence.astype(np.uint8), axis=-2) nulls = np.sum(sequence, axis=-1) == sequence.shape[-1] sequence = sequence.astype(float) sequence[nulls, :] = 1.0 / sequence.shape[-1] From bc34a8850f19e3e02cede85c8d435ced9b7c8851 Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 3 Mar 2023 14:36:08 -0500 Subject: [PATCH 11/62] change loss --- selene_sdk/train_methylation_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py index 097affef..43737a89 100644 --- a/selene_sdk/train_methylation_model.py +++ b/selene_sdk/train_methylation_model.py @@ -524,7 +524,7 @@ def train(self): self.optimizer.zero_grad() total_loss = cls_loss if torch.is_tensor(reg_loss): - total_loss += reg_loss + total_loss += 2*reg_loss total_loss.backward() self.optimizer.step() @@ -597,7 +597,7 @@ def _evaluate_on_data(self, data_in_batches): batch_bce.append(cls_loss.item()) total_loss = cls_loss if torch.is_tensor(reg_loss): - total_loss += reg_loss + total_loss += 2*reg_loss batch_mse.append(reg_loss.item()) batch_losses.append(total_loss.item()) all_predictions = torch.vstack(all_predictions) From 31f55ac13d57880bc98c0424b6ca5187341c0252 Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 20 Mar 2023 13:24:25 -0400 Subject: [PATCH 12/62] experimenting w loss --- selene_sdk/train_methylation_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py index 43737a89..e794ad73 100644 --- a/selene_sdk/train_methylation_model.py +++ b/selene_sdk/train_methylation_model.py @@ -522,9 +522,9 @@ def train(self): # subset_pred, subset_tgt, reduction='none').nanmean(dim=1).mean() self.optimizer.zero_grad() - total_loss = cls_loss + total_loss = cls_loss #*0 if torch.is_tensor(reg_loss): - total_loss += 2*reg_loss + total_loss += reg_loss total_loss.backward() self.optimizer.step() @@ -595,9 +595,9 @@ def _evaluate_on_data(self, data_in_batches): all_ind_predictions.append(cls_pred) batch_bce.append(cls_loss.item()) - total_loss = cls_loss + total_loss = cls_loss #*0 if torch.is_tensor(reg_loss): - total_loss += 2*reg_loss + total_loss += reg_loss batch_mse.append(reg_loss.item()) batch_losses.append(total_loss.item()) all_predictions = torch.vstack(all_predictions) From b40b8cd69c14fd14e645d61cd31dd5933a4b83ac Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 21 Mar 2023 11:31:30 -0400 Subject: [PATCH 13/62] add pearsonr --- selene_sdk/targets/genomic_features_h5.py | 5 ++++- selene_sdk/train_methylation_model.py | 18 ++++++++++-------- .../utils/methylation_performance_metrics.py | 6 ++++-- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git 
a/selene_sdk/targets/genomic_features_h5.py b/selene_sdk/targets/genomic_features_h5.py index 9f791010..fedef98c 100644 --- a/selene_sdk/targets/genomic_features_h5.py +++ b/selene_sdk/targets/genomic_features_h5.py @@ -253,4 +253,7 @@ def get_feature_data(self, chrom, start, end): row_targets.append(self.data[target]) if len(row_targets) == 0: # query was empty return False, nans - return True, np.average(np.vstack(row_targets), axis=0) + row_targets = np.vstack(row_targets) + if len(row_targets) == 1: + return True, row_targets[0] + return True, np.average(row_targets, axis=0) diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py index e794ad73..5d341b10 100644 --- a/selene_sdk/train_methylation_model.py +++ b/selene_sdk/train_methylation_model.py @@ -14,6 +14,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.optim.lr_scheduler import ReduceLROnPlateau +from scipy.stats import pearsonr from sklearn.metrics import average_precision_score from sklearn.metrics import roc_auc_score from sklearn.metrics import mean_squared_error @@ -274,7 +275,8 @@ def __init__(self, logger.debug("Set modules to use CUDA") self._report_gt_feature_n_positives = report_gt_feature_n_positives - self._metrics = dict(cls_average_precision=average_precision_score,) + self._metrics = dict(cls_average_precision=average_precision_score, + pearsonr=pearsonr) #reg_mse=mean_squared_error) self._n_validation_samples = n_validation_samples self._n_test_samples = n_test_samples @@ -629,16 +631,16 @@ def validate(self): cls_valid_scores = self._validation_metrics.update( val_pred_inds, self._all_validation_inds.reshape(len(val_pred_inds), 1), scores=['cls_average_precision']) - #reg_valid_scores = self._validation_metrics.update( - # val_pred_tgts[self._all_validation_inds == 1], - # self._all_validation_targets[self._all_validation_inds == 1], - # scores=['reg_mse']) + reg_valid_scores = self._validation_metrics.update( + val_pred_tgts[self._all_validation_inds == 1], + self._all_validation_targets[self._all_validation_inds == 1], + scores=['pearsonr']) for name, score in cls_valid_scores.items(): logger.info("validation {0}: {1}".format(name, score)) lossdict[name] = score - #for name, score in reg_valid_scores.items(): - # logger.info("validation {0}: {1}".format(name, score)) - # lossdict[name] = score + for name, score in reg_valid_scores.items(): + logger.info("validation {0}: {1}".format(name, score)) + lossdict[name] = score to_log = [lossdict['loss'], lossdict['mse'], lossdict['bce']] for k in sorted(self._validation_metrics.metrics.keys()): diff --git a/selene_sdk/utils/methylation_performance_metrics.py b/selene_sdk/utils/methylation_performance_metrics.py index fec5f5e3..876d8b43 100644 --- a/selene_sdk/utils/methylation_performance_metrics.py +++ b/selene_sdk/utils/methylation_performance_metrics.py @@ -79,8 +79,10 @@ def compute_score(prediction, target, metric_fn, if len(np.unique(feature_targets)) > 0 and \ np.count_nonzero(feature_targets) > report_gt_feature_n_positives: try: - feature_scores[index] = metric_fn( - feature_targets, feature_preds) + output = metric_fn(feature_targets, feature_preds) + if type(output) == tuple: + output = output[0] + feature_scores[index] = output except ValueError: # do I need to make this more generic? continue valid_feature_scores = [s for s in feature_scores if not np.isnan(s)] # Allow 0 or negative values. 
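
Note on the unwrapping introduced above: unlike the sklearn metrics used so
far, `scipy.stats.pearsonr` does not return a bare float but a
(statistic, p-value) pair, which is what indexing with `[0]` recovers in
`compute_score`. A minimal standalone sketch of the behavior this assumes
(toy arrays, not taken from the patches):

    import numpy as np
    from scipy.stats import pearsonr
    from sklearn.metrics import average_precision_score

    feature_targets = np.array([0.10, 0.40, 0.35, 0.80, 0.65])
    feature_preds = np.array([0.15, 0.30, 0.40, 0.70, 0.60])

    out = pearsonr(feature_targets, feature_preds)
    score = out if type(out) == float else out[0]  # mirrors compute_score
    print(score)  # the correlation coefficient, without the p-value

    # sklearn-style metrics already return a plain float and pass through:
    print(average_precision_score(np.array([0, 0, 1, 1, 1]), feature_preds))
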
From 8b5c884206e54ddf3e3d3bd60be53a66a1edd251 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 22 Mar 2023 09:58:41 -0400 Subject: [PATCH 14/62] fix metrix nan removal bug --- selene_sdk/utils/methylation_performance_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selene_sdk/utils/methylation_performance_metrics.py b/selene_sdk/utils/methylation_performance_metrics.py index 876d8b43..4ed09886 100644 --- a/selene_sdk/utils/methylation_performance_metrics.py +++ b/selene_sdk/utils/methylation_performance_metrics.py @@ -74,8 +74,8 @@ def compute_score(prediction, target, metric_fn, prediction = prediction.T for index, feature_preds in enumerate(prediction): feature_targets = target[:, index] + feature_preds = feature_preds[~np.isnan(feature_targets)] feature_targets = feature_targets[~np.isnan(feature_targets)] - feature_preds = feature_preds[~np.isnan(feature_preds)] if len(np.unique(feature_targets)) > 0 and \ np.count_nonzero(feature_targets) > report_gt_feature_n_positives: try: From 54ddea1cb1c3ee4a08dc03bcbdb89afde12c0002 Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 3 Apr 2023 11:31:05 -0400 Subject: [PATCH 15/62] changes to sampler for positives-only hdf5 --- selene_sdk/samplers/dataloader.py | 40 +++++++++++++++++++++------- selene_sdk/samplers/multi_sampler.py | 40 +++++++++++++++++----------- 2 files changed, 56 insertions(+), 24 deletions(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index b9a1edec..20a18d7e 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -3,6 +3,7 @@ which allow parallel sampling for any Sampler using torch DataLoader mechanism. """ +import random import sys import h5py @@ -162,7 +163,8 @@ def __init__(self, in_memory=False, unpackbits=False, sequence_key="sequences", - targets_key="targets"): + targets_key="targets", + indicators_key=False): super(_H5Dataset, self).__init__() self.file_path = file_path self.in_memory = in_memory @@ -171,6 +173,7 @@ def __init__(self, self._initialized = False self._sequence_key = sequence_key self._targets_key = targets_key + self._indicators_key = indicators_key def init(func): # delay initialization to allow multiprocessing @@ -178,17 +181,24 @@ def init(func): def dfunc(self, *args, **kwargs): if not self._initialized: self.db = h5py.File(self.file_path, 'r') + key = 'indicator' + if key not in self.db and self._indicators_key: + key = 'indicators' if self.unpackbits: self.s_len = self.db['{0}_length'.format(self._sequence_key)][()] #self.t_len = self.db['{0}_length'.format(self._targets_key)][()] if self.in_memory: self.sequences = np.asarray(self.db[self._sequence_key]) self.targets = np.asarray(self.db[self._targets_key]) - self.indicators = np.asarray(self.db['indicators']) + self.indicators = None + if self._indicators_key: + self.indicators = np.asarray(self.db[key]) else: self.sequences = self.db[self._sequence_key] self.targets = self.db[self._targets_key] - self.indicators = self.db['indicators'] + self.indicators = None + if self._indicators_key: + self.indicators = self.db[key] self._initialized = True return func(self, *args, **kwargs) return dfunc @@ -214,9 +224,14 @@ def __getitem__(self, index): # targets = targets[:, :self.t_len] #else: # targets = targets[:self.t_len] - return (torch.from_numpy(sequence.astype(np.float32)), - torch.from_numpy(targets.astype(np.float32)), - self.indicators[index]) + if self.indicators is not None: + return (torch.from_numpy(sequence.astype(np.float32)), + 
torch.from_numpy(targets.astype(np.float32)), + self.indicators[index]) + else: + return (torch.from_numpy(sequence.astype(np.float32)), + torch.from_numpy(targets.astype(np.float32)),) + @init def __len__(self): @@ -299,16 +314,23 @@ def __init__(self, sampler=None, batch_sampler=None, shuffle=True): + g = torch.Generator() + g.manual_seed(seed) + def worker_init_fn(worker_id): - np.random.seed(seed + worker_id) + worker_seed = torch.initial_seed() % 2**32 + print("Worker seed", worker_seed) + np.random.seed(worker_seed) + random.seed(worker_seed) + torch.manual_seed(worker_seed) args = { "batch_size": batch_size, - #"num_workers": 0 if in_memory else num_workers, "pin_memory": True, "worker_init_fn": worker_init_fn, "sampler": sampler, - "batch_sampler": batch_sampler + "batch_sampler": batch_sampler, + "generator": g, } if hasattr(dataset, 'in_memory'): diff --git a/selene_sdk/samplers/multi_sampler.py b/selene_sdk/samplers/multi_sampler.py index c5f0ecba..36296187 100644 --- a/selene_sdk/samplers/multi_sampler.py +++ b/selene_sdk/samplers/multi_sampler.py @@ -216,13 +216,17 @@ def sample(self, batch_size=1, mode=None): else: self._set_batch_size(batch_size, mode=mode) try: - data, targets, ind = next(self._iterators[mode]) - return data.numpy(), targets.numpy(), ind.numpy() + data, targets = next(self._iterators[mode]) + return data.numpy(), targets.numpy() + #data, targets, ind = next(self._iterators[mode]) + #return data.numpy(), targets.numpy(), ind.numpy() except StopIteration: #If DataLoader iterator reaches its length, reinitialize self._iterators[mode] = iter(self._dataloaders[mode]) - data, targets, ind = next(self._iterators[mode]) - return data.numpy(), targets.numpy(), ind.numpy() + #data, targets, ind = next(self._iterators[mode]) + #return data.numpy(), targets.numpy(), ind.numpy() + data, targets = next(self._iterators[mode]) + return data.numpy(), targets.numpy() def get_data_and_targets(self, batch_size, n_samples=None, mode=None): """ @@ -260,22 +264,28 @@ def get_data_and_targets(self, batch_size, n_samples=None, mode=None): self._set_batch_size(batch_size, mode=mode) data_and_targets = [] targets_mat = [] - ind_mat = [] + #ind_mat = [] count = batch_size while count < n_samples: - data, tgts, ind = self.sample(batch_size=batch_size, mode=mode) - data_and_targets.append((data, tgts, ind)) - targets_mat.append(tgts) - ind_mat.append(ind) + #data, tgts, ind = self.sample(batch_size=batch_size, mode=mode) + #data_and_targets.append((data, tgts, ind)) + output = self.sample(batch_size=batch_size, mode=mode) + data_and_targets.append(output) + targets_mat.append(output[1]) + #ind_mat.append(ind) count += batch_size remainder = batch_size - (count - n_samples) - data, tgts, ind = self.sample(batch_size=remainder) - data_and_targets.append((data, tgts, ind)) - targets_mat.append(tgts) - ind_mat.append(ind) + #data, tgts, ind = self.sample(batch_size=remainder) + #data_and_targets.append((data, tgts, ind)) + #targets_mat.append(tgts) + #ind_mat.append(ind) + output = self.sample(batch_size=remainder) + data_and_targets.append(output) + targets_mat.append(output[1]) targets_mat = np.vstack(targets_mat) - ind_mat = np.hstack(ind_mat) - return data_and_targets, targets_mat, ind_mat + #ind_mat = np.hstack(ind_mat) + #return data_and_targets, targets_mat, ind_mat + return data_and_targets, targets_mat def get_validation_set(self, batch_size, n_samples=None): """ From 3bcd0980401ddc3990f03cc469895952f7208eeb Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 3 Apr 2023 11:31:59 -0400 
Subject: [PATCH 16/62] loss weighting --- selene_sdk/train_methylation_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py index 5d341b10..31198e2e 100644 --- a/selene_sdk/train_methylation_model.py +++ b/selene_sdk/train_methylation_model.py @@ -524,7 +524,7 @@ def train(self): # subset_pred, subset_tgt, reduction='none').nanmean(dim=1).mean() self.optimizer.zero_grad() - total_loss = cls_loss #*0 + total_loss = cls_loss #* 0.01 if torch.is_tensor(reg_loss): total_loss += reg_loss total_loss.backward() @@ -597,7 +597,7 @@ def _evaluate_on_data(self, data_in_batches): all_ind_predictions.append(cls_pred) batch_bce.append(cls_loss.item()) - total_loss = cls_loss #*0 + total_loss = cls_loss #* 0.01 if torch.is_tensor(reg_loss): total_loss += reg_loss batch_mse.append(reg_loss.item()) From 9e2796959619d64a1ddfa20c94c734e9b8fdb5b5 Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 3 Apr 2023 12:33:21 -0400 Subject: [PATCH 17/62] adjust methylation perf metric output checking --- selene_sdk/utils/methylation_performance_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selene_sdk/utils/methylation_performance_metrics.py b/selene_sdk/utils/methylation_performance_metrics.py index 4ed09886..353f8262 100644 --- a/selene_sdk/utils/methylation_performance_metrics.py +++ b/selene_sdk/utils/methylation_performance_metrics.py @@ -80,7 +80,7 @@ def compute_score(prediction, target, metric_fn, np.count_nonzero(feature_targets) > report_gt_feature_n_positives: try: output = metric_fn(feature_targets, feature_preds) - if type(output) == tuple: + if type(output) != float: output = output[0] feature_scores[index] = output except ValueError: # do I need to make this more generic? 
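
On the lingering question in the comment above ("do I need to make this more
generic?"): the `type(output) != float` test covers the metrics registered
here, but numpy floating scalars would also fail an exact float check and
then be indexed by mistake. One possible generalization, sketched with a
hypothetical helper that is not part of any patch in this series:

    import numbers

    def as_scalar_score(output):
        # plain Python floats and numpy floating scalars pass through
        if isinstance(output, numbers.Real):
            return float(output)
        # (statistic, p-value)-style results, e.g. from scipy.stats.pearsonr
        # and scipy.stats.spearmanr, expose the statistic at index 0
        return float(output[0])
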
From 462b7296ee5ae7206ce1676168d2864220bdb57d Mon Sep 17 00:00:00 2001 From: Kathy Date: Thu, 6 Apr 2023 12:02:19 -0400 Subject: [PATCH 18/62] revert multisampler and add spearmanr to training log --- selene_sdk/samplers/multi_sampler.py | 30 +++++++++++++++------------ selene_sdk/train_methylation_model.py | 5 +++-- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/selene_sdk/samplers/multi_sampler.py b/selene_sdk/samplers/multi_sampler.py index 36296187..0796c42b 100644 --- a/selene_sdk/samplers/multi_sampler.py +++ b/selene_sdk/samplers/multi_sampler.py @@ -216,17 +216,18 @@ def sample(self, batch_size=1, mode=None): else: self._set_batch_size(batch_size, mode=mode) try: - data, targets = next(self._iterators[mode]) - return data.numpy(), targets.numpy() - #data, targets, ind = next(self._iterators[mode]) - #return data.numpy(), targets.numpy(), ind.numpy() + #data, targets = next(self._iterators[mode]) + #return data.numpy(), targets.numpy() + data, targets, ind = next(self._iterators[mode]) + return data.numpy(), targets.numpy(), ind.numpy() except StopIteration: #If DataLoader iterator reaches its length, reinitialize self._iterators[mode] = iter(self._dataloaders[mode]) - #data, targets, ind = next(self._iterators[mode]) - #return data.numpy(), targets.numpy(), ind.numpy() - data, targets = next(self._iterators[mode]) - return data.numpy(), targets.numpy() + + #data, targets = next(self._iterators[mode]) + #return data.numpy(), targets.numpy() + data, targets, ind = next(self._iterators[mode]) + return data.numpy(), targets.numpy(), ind.numpy() def get_data_and_targets(self, batch_size, n_samples=None, mode=None): """ @@ -264,15 +265,16 @@ def get_data_and_targets(self, batch_size, n_samples=None, mode=None): self._set_batch_size(batch_size, mode=mode) data_and_targets = [] targets_mat = [] - #ind_mat = [] + ind_mat = [] count = batch_size while count < n_samples: #data, tgts, ind = self.sample(batch_size=batch_size, mode=mode) #data_and_targets.append((data, tgts, ind)) + output = self.sample(batch_size=batch_size, mode=mode) data_and_targets.append(output) targets_mat.append(output[1]) - #ind_mat.append(ind) + ind_mat.append(output[2]) count += batch_size remainder = batch_size - (count - n_samples) #data, tgts, ind = self.sample(batch_size=remainder) @@ -283,9 +285,11 @@ def get_data_and_targets(self, batch_size, n_samples=None, mode=None): data_and_targets.append(output) targets_mat.append(output[1]) targets_mat = np.vstack(targets_mat) - #ind_mat = np.hstack(ind_mat) - #return data_and_targets, targets_mat, ind_mat - return data_and_targets, targets_mat + + ind_mat.append(output[2]) + ind_mat = np.hstack(ind_mat) + return data_and_targets, targets_mat, ind_mat + #return data_and_targets, targets_mat def get_validation_set(self, batch_size, n_samples=None): """ diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py index 31198e2e..efde31b4 100644 --- a/selene_sdk/train_methylation_model.py +++ b/selene_sdk/train_methylation_model.py @@ -15,6 +15,7 @@ import torch.nn.functional as F from torch.optim.lr_scheduler import ReduceLROnPlateau from scipy.stats import pearsonr +from scipy.stats import spearmanr from sklearn.metrics import average_precision_score from sklearn.metrics import roc_auc_score from sklearn.metrics import mean_squared_error @@ -276,7 +277,7 @@ def __init__(self, self._report_gt_feature_n_positives = report_gt_feature_n_positives self._metrics = dict(cls_average_precision=average_precision_score, - 
pearsonr=pearsonr) + pearsonr=pearsonr, spearmanr=spearmanr) #reg_mse=mean_squared_error) self._n_validation_samples = n_validation_samples self._n_test_samples = n_test_samples @@ -634,7 +635,7 @@ def validate(self): reg_valid_scores = self._validation_metrics.update( val_pred_tgts[self._all_validation_inds == 1], self._all_validation_targets[self._all_validation_inds == 1], - scores=['pearsonr']) + scores=['pearsonr', 'spearmanr']) for name, score in cls_valid_scores.items(): logger.info("validation {0}: {1}".format(name, score)) lossdict[name] = score From d0ae0916f4b8556d38ad47ec9cf347fcb439ad60 Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 14 Apr 2023 09:15:13 -0400 Subject: [PATCH 19/62] explicit metric fn checking, refine later --- selene_sdk/utils/methylation_performance_metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/selene_sdk/utils/methylation_performance_metrics.py b/selene_sdk/utils/methylation_performance_metrics.py index 353f8262..0142fee1 100644 --- a/selene_sdk/utils/methylation_performance_metrics.py +++ b/selene_sdk/utils/methylation_performance_metrics.py @@ -80,7 +80,7 @@ def compute_score(prediction, target, metric_fn, np.count_nonzero(feature_targets) > report_gt_feature_n_positives: try: output = metric_fn(feature_targets, feature_preds) - if type(output) != float: + if type(output) != float and (metric_fn.__name__ == 'spearmanr' or metric_fn.__name__ == 'pearsonr'): output = output[0] feature_scores[index] = output except ValueError: # do I need to make this more generic? From 1fd2d96a6d9bfbf7c906983444d580648a02326b Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 7 Jul 2023 16:13:16 -0400 Subject: [PATCH 20/62] minor adjustment to dataloader --- selene_sdk/samplers/dataloader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index 20a18d7e..65e0d7c0 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -208,7 +208,7 @@ def __getitem__(self, index): if isinstance(index, int): index = index % self.sequences.shape[0] sequence = self.sequences[index, :, :] - targets = self.targets[index, :] + targets = self.targets[index] if self.unpackbits: sequence = np.unpackbits(sequence.astype(np.uint8), axis=-2) nulls = np.sum(sequence, axis=-1) == sequence.shape[-1] @@ -224,6 +224,9 @@ def __getitem__(self, index): # targets = targets[:, :self.t_len] #else: # targets = targets[:self.t_len] + #if np.random.randint(2) == 1: # ONLY for unet + # sequence = np.flip(sequence, axis=-1) + # targets = np.flip(targets, axis=-1) if self.indicators is not None: return (torch.from_numpy(sequence.astype(np.float32)), torch.from_numpy(targets.astype(np.float32)), From 6f75a45ba0d9cb1a0c4d47f9c43b5a62c9a6aa31 Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 10 Jul 2023 14:14:33 -0400 Subject: [PATCH 21/62] non strand specific temp changes --- selene_sdk/utils/non_strand_specific_module.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/selene_sdk/utils/non_strand_specific_module.py b/selene_sdk/utils/non_strand_specific_module.py index 7f4b5163..893b52d0 100644 --- a/selene_sdk/utils/non_strand_specific_module.py +++ b/selene_sdk/utils/non_strand_specific_module.py @@ -71,11 +71,15 @@ def forward(self, input): #output_from_rev1, output_from_rev2 = self.model.forward(reverse_input) output = self.model.forward(input) output_from_rev = self.model.forward(reverse_input) - - if self.mode == "mean": - return 
(output + output_from_rev) / 2
+        if type(output) == tuple:
+            output1, output2 = output
+            output_from_rev1, output_from_rev2 = output_from_rev
+            return ((output1 + output_from_rev1) / 2, (output2 + output_from_rev2) / 2)
         else:
-            return torch.max(output, output_from_rev)
+            if self.mode == "mean":
+                return (output + output_from_rev) / 2
+            else:
+                return torch.max(output, output_from_rev)
         #if self.mode == "mean":
         #    return ((output1 + output_from_rev1) / 2, (output2 + output_from_rev2) / 2)
         #else:
         #    return torch.max(output1, output_from_rev1), torch.max(output2, output_from_rev2)
 
 
From 8ac4db6362c90e23b5302e572aa25949b5867728 Mon Sep 17 00:00:00 2001
From: Kathy
Date: Fri, 14 Jul 2023 09:19:14 -0400
Subject: [PATCH 22/62] positives only sampler

---
 selene_sdk/samplers/multi_sampler.py | 32 ++++++++++++++--------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/selene_sdk/samplers/multi_sampler.py b/selene_sdk/samplers/multi_sampler.py
index 0796c42b..1cb6d751 100644
--- a/selene_sdk/samplers/multi_sampler.py
+++ b/selene_sdk/samplers/multi_sampler.py
@@ -216,18 +216,18 @@ def sample(self, batch_size=1, mode=None):
         else:
             self._set_batch_size(batch_size, mode=mode)
         try:
-            #data, targets = next(self._iterators[mode])
-            #return data.numpy(), targets.numpy()
-            data, targets, ind = next(self._iterators[mode])
-            return data.numpy(), targets.numpy(), ind.numpy()
+            data, targets = next(self._iterators[mode])
+            return data.numpy(), targets.numpy()
+            #data, targets, ind = next(self._iterators[mode])
+            #return data.numpy(), targets.numpy(), ind.numpy()
         except StopIteration:
             #If DataLoader iterator reaches its length, reinitialize
             self._iterators[mode] = iter(self._dataloaders[mode])
 
-            #data, targets = next(self._iterators[mode])
-            #return data.numpy(), targets.numpy()
-            data, targets, ind = next(self._iterators[mode])
-            return data.numpy(), targets.numpy(), ind.numpy()
+            data, targets = next(self._iterators[mode])
+            return data.numpy(), targets.numpy()
+            #data, targets, ind = next(self._iterators[mode])
+            #return data.numpy(), targets.numpy(), ind.numpy()
 
 def get_data_and_targets(self, batch_size, n_samples=None, mode=None):
@@ -265,7 +265,7 @@ def get_data_and_targets(self, batch_size, n_samples=None, mode=None):
         self._set_batch_size(batch_size, mode=mode)
         data_and_targets = []
         targets_mat = []
-        ind_mat = []
+        #ind_mat = []
         count = batch_size
         while count < n_samples:
             #data, tgts, ind = self.sample(batch_size=batch_size, mode=mode)
             #data_and_targets.append((data, tgts, ind))
 
             output = self.sample(batch_size=batch_size, mode=mode)
             data_and_targets.append(output)
             targets_mat.append(output[1])
-            ind_mat.append(output[2])
+            #ind_mat.append(output[2])
             count += batch_size
         remainder = batch_size - (count - n_samples)
-        #data, tgts, ind = self.sample(batch_size=remainder)
+        #data, tgts, ind = self.sample(batch_size=remainder, mode=mode)
         #data_and_targets.append((data, tgts, ind))
         #targets_mat.append(tgts)
         #ind_mat.append(ind)
-        output = self.sample(batch_size=remainder)
+        output = self.sample(batch_size=remainder, mode=mode)
         data_and_targets.append(output)
         targets_mat.append(output[1])
         targets_mat = np.vstack(targets_mat)
 
-        ind_mat.append(output[2])
-        ind_mat = np.hstack(ind_mat)
-        return data_and_targets, targets_mat, ind_mat
-        #return data_and_targets, targets_mat
+        #ind_mat.append(output[2])
+        #ind_mat = np.hstack(ind_mat)
+        #return data_and_targets, targets_mat, ind_mat
+        return data_and_targets, targets_mat
 
From 58a868be37348b5a3b5fea6212d222f96dc901f8 Mon Sep 17 00:00:00 2001
From: Kathy
Date: Tue, 8 Aug 2023 12:57:46 -0400
Subject: [PATCH 23/62] revamp dataloader, seq length flexibility

---
 selene_sdk/samplers/dataloader.py | 77 +++++++++++++++++++++++--------
 1 file changed, 59 insertions(+), 18 deletions(-)

diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py
index 65e0d7c0..2b2bd7f2 100644
--- a/selene_sdk/samplers/dataloader.py
+++ b/selene_sdk/samplers/dataloader.py
@@ -126,6 +126,27 @@ def worker_init_fn(worker_id):
         self.seed = seed
 
 
+def unpackbits_sequence(sequence, s_len):
+    sequence = np.unpackbits(sequence.astype(np.uint8), axis=-2)
+    nulls = np.sum(sequence, axis=-1) == sequence.shape[-1]
+    sequence = sequence.astype(float)
+    sequence[nulls, :] = 1.0 / sequence.shape[-1]
+    if sequence.ndim == 3:
+        sequence = sequence[:, :s_len, :]
+    else:
+        sequence = sequence[:s_len, :]
+    return sequence
+
+
+def unpackbits_targets(targets, t_len):
+    targets = np.unpackbits(targets, axis=-1).astype(float)
+    if targets.ndim == 2:
+        targets = targets[:, :t_len]
+    else:
+        targets = targets[:t_len]
+    return targets
+
+
 class _H5Dataset(Dataset):
     """
     This class provides a Dataset that directly loads sequences and targets
@@ -161,14 +182,23 @@ class _H5Dataset(Dataset):
     def __init__(self,
                  file_path,
                  in_memory=False,
-                 unpackbits=False,
+                 unpackbits=False,  # implies unpackbits for both sequences and targets
+                 unpackbits_seq=False,
+                 unpackbits_tgt=False,
                  sequence_key="sequences",
                  targets_key="targets",
-                 indicators_key=False):
+                 indicators_key=False,
+                 use_seq_len=None):
         super(_H5Dataset, self).__init__()
         self.file_path = file_path
         self.in_memory = in_memory
+
         self.unpackbits = unpackbits
+        self.unpackbits_seq = unpackbits_seq
+        self.unpackbits_tgt = unpackbits_tgt
+
+        self.use_seq_len = use_seq_len
+        self._seq_start, self._seq_end = None, None
 
         self._initialized = False
         self._sequence_key = sequence_key
@@ -184,9 +214,15 @@ def dfunc(self, *args, **kwargs):
                 key = 'indicator'
                 if key not in self.db and self._indicators_key:
                     key = 'indicators'
+
                 if self.unpackbits:
                     self.s_len = self.db['{0}_length'.format(self._sequence_key)][()]
-                    #self.t_len = self.db['{0}_length'.format(self._targets_key)][()]
+                    self.t_len = self.db['{0}_length'.format(self._targets_key)][()]
+                elif self.unpackbits_seq:
+                    self.s_len = self.db['{0}_length'.format(self._sequence_key)][()]
+                elif self.unpackbits_tgt:
+                    self.t_len = self.db['{0}_length'.format(self._targets_key)][()]
+
                 if self.in_memory:
                     self.sequences = np.asarray(self.db[self._sequence_key])
                     self.targets = np.asarray(self.db[self._targets_key])
@@ -207,23 +243,27 @@ def __getitem__(self, index):
         if isinstance(index, int):
             index = index % self.sequences.shape[0]
-        sequence = self.sequences[index, :, :]
+        sequence = self.sequences[index]
         targets = self.targets[index]
+
         if self.unpackbits:
-            sequence = np.unpackbits(sequence.astype(np.uint8), axis=-2)
-            nulls = np.sum(sequence, axis=-1) == sequence.shape[-1]
-            sequence = sequence.astype(float)
-            sequence[nulls, :] = 1.0 / sequence.shape[-1]
-            #targets = np.unpackbits(
-            #    targets, axis=-1).astype(float)
-            if sequence.ndim == 3:
-                sequence = sequence[:, :self.s_len, :]
-            else:
-                sequence = sequence[:self.s_len, :]
-            #if targets.ndim == 2:
-            #    targets = targets[:, :self.t_len]
-            #else:
-            #    targets = targets[:self.t_len]
+            sequence = unpackbits_sequence(sequence, self.s_len)
+            targets = unpackbits_targets(targets, self.t_len)
+        elif self.unpackbits_seq:
+            sequence = unpackbits_sequence(sequence, self.s_len)
+        elif self.unpackbits_tgt:
+            targets = unpackbits_targets(targets, self.t_len)
+
+        if self._seq_start is None:
+            # determine the input window once and cache it; center the
+            # window when `use_seq_len` is shorter than the stored length
+            self._seq_start = 0
+            self._seq_end = len(sequence)
+            if self.use_seq_len is not None:
+                mid = len(sequence) // 2
+                self._seq_start = mid - self.use_seq_len // 2
+                self._seq_end = int(mid + np.ceil(self.use_seq_len / 2))
+        sequence = sequence[self._seq_start:self._seq_end]
+
         #if np.random.randint(2) == 1: # ONLY for unet
         #    sequence = np.flip(sequence, axis=-1)
         #    targets = np.flip(targets, axis=-1)
@@ -351,3 +391,4 @@
         else:
             args["shuffle"] = shuffle
         super(H5DataLoader, self).__init__(dataset, **args)
+
From 249f302cfc476e06329288929ea8e1a7944d746c Mon Sep 17 00:00:00 2001
From: Kathy
Date: Tue, 8 Aug 2023 12:58:15 -0400
Subject: [PATCH 24/62] attempted a nonstrandspecific utils module, not being used

---
 .../utils/non_strand_specific_module.py | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/selene_sdk/utils/non_strand_specific_module.py b/selene_sdk/utils/non_strand_specific_module.py
index 893b52d0..73466ef0 100644
--- a/selene_sdk/utils/non_strand_specific_module.py
+++ b/selene_sdk/utils/non_strand_specific_module.py
@@ -86,3 +86,56 @@ def forward(self, input):
 #    return torch.max(output1, output_from_rev1), torch.max(output2, output_from_rev2)
 
 
+
+
+class NonStrandSpecificInteract(Module):
+    """
+    A torch.nn.Module that wraps a user-specified model architecture if the
+    architecture does not need to account for sequence strand-specificity.
+
+    Parameters
+    ----------
+    model : torch.nn.Module
+        The user-specified model architecture.
+    mode : {'mean', 'max'}, optional
+        Default is 'mean'. NonStrandSpecificInteract will pass the input and
+        the reverse-complement of the input into `model`. The mode specifies
+        whether we should output the mean or max of the predictions as
+        the non-strand specific prediction.
+
+    Attributes
+    ----------
+    model : torch.nn.Module
+        The user-specified model architecture.
+    mode : {'mean', 'max'}
+        How to handle outputting a non-strand specific prediction.
+ + """ + + def __init__(self, model, mode="mean"): + super(NonStrandSpecificInteract, self).__init__() + + self.model = model + + if mode != "mean" and mode != "max": + raise ValueError("Mode should be one of 'mean' or 'max' but was" + "{0}.".format(mode)) + self.mode = mode + + def forward(self, input): + reverse_input = 5 - input + reverse_input[reverse_input == 5] = 0 + + output = self.model.forward(input)[0] + output_from_rev = self.model.forward(reverse_input)[0] + if type(output) == tuple: + output1, output2 = output + output_from_rev1, output_from_rev2 = output_from_rev + return ((output1 + output_from_rev1) / 2, (output2 + output_from_rev2) / 2) + else: + if self.mode == "mean": + return (output + output_from_rev) / 2 + else: + return torch.max(output, output_from_rev) + + From cb55bbca34844d795c675968e14c025a37ff20a9 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 16 Aug 2023 09:31:01 -0400 Subject: [PATCH 25/62] evaluation classes --- selene_sdk/evaluate_methylation_interact.py | 295 ++++++++++++++++++++ selene_sdk/evaluate_methylation_only.py | 292 +++++++++++++++++++ 2 files changed, 587 insertions(+) create mode 100644 selene_sdk/evaluate_methylation_interact.py create mode 100644 selene_sdk/evaluate_methylation_only.py diff --git a/selene_sdk/evaluate_methylation_interact.py b/selene_sdk/evaluate_methylation_interact.py new file mode 100644 index 00000000..60722326 --- /dev/null +++ b/selene_sdk/evaluate_methylation_interact.py @@ -0,0 +1,295 @@ +""" +This module provides the EvaluateModel class. +""" +import logging +import os +import warnings + +import h5py +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import average_precision_score, roc_auc_score +from sklearn.metrics import mean_squared_error + +from .sequences import Genome +from .utils import (PerformanceMetrics, _is_lua_trained_model, + initialize_logger, load_model_from_state_dict) +from .utils.methylation_performance_metrics import MethylationPerformanceMetrics + +logger = logging.getLogger("selene") + + +class EvaluateMethylationOnly(object): + """ + Evaluate model on a test set of sequences with known targets. + + Parameters + ---------- + model : torch.nn.Module + The model architecture. + criterion : torch.nn._Loss + The loss function that was optimized during training. + data_sampler : selene_sdk.samplers.Sampler + Used to retrieve samples from the test set for evaluation. + features : list(str) + List of distinct features the model predicts. + trained_model_path : str + Path to the trained model file, saved using `torch.save`. + output_dir : str + The output directory in which to save model evaluation and logs. + batch_size : int, optional + Default is 64. Specify the batch size to process examples. + Should be a power of 2. + n_test_samples : int or None, optional + Default is `None`. Use `n_test_samples` if you want to limit the + number of samples on which you evaluate your model. If you are + using a sampler of type `selene_sdk.samplers.OnlineSampler`, + by default it will draw 640000 samples if `n_test_samples` is `None`. + report_gt_feature_n_positives : int, optional + Default is 10. In the final test set, each class/feature must have + more than `report_gt_feature_n_positives` positive samples in order to + be considered in the test performance computation. The output file that + states each class' performance will report 'NA' for classes that do + not have enough positive samples. 
From cb55bbca34844d795c675968e14c025a37ff20a9 Mon Sep 17 00:00:00 2001
From: Kathy
Date: Wed, 16 Aug 2023 09:31:01 -0400
Subject: [PATCH 25/62] evaluation classes

---
 selene_sdk/evaluate_methylation_interact.py | 295 ++++++++++++++++++++
 selene_sdk/evaluate_methylation_only.py     | 292 +++++++++++++++++++
 2 files changed, 587 insertions(+)
 create mode 100644 selene_sdk/evaluate_methylation_interact.py
 create mode 100644 selene_sdk/evaluate_methylation_only.py

diff --git a/selene_sdk/evaluate_methylation_interact.py b/selene_sdk/evaluate_methylation_interact.py
new file mode 100644
index 00000000..60722326
--- /dev/null
+++ b/selene_sdk/evaluate_methylation_interact.py
@@ -0,0 +1,295 @@
+"""
+This module provides the EvaluateModel class.
+"""
+import logging
+import os
+import warnings
+
+import h5py
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from scipy.stats import pearsonr, spearmanr
+from sklearn.metrics import average_precision_score, roc_auc_score
+from sklearn.metrics import mean_squared_error
+
+from .sequences import Genome
+from .utils import (PerformanceMetrics, _is_lua_trained_model,
+                    initialize_logger, load_model_from_state_dict)
+from .utils.methylation_performance_metrics import MethylationPerformanceMetrics
+
+logger = logging.getLogger("selene")
+
+
+class EvaluateMethylationOnly(object):
+    """
+    Evaluate model on a test set of sequences with known targets.
+
+    Parameters
+    ----------
+    model : torch.nn.Module
+        The model architecture.
+    criterion : torch.nn._Loss
+        The loss function that was optimized during training.
+    data_sampler : selene_sdk.samplers.Sampler
+        Used to retrieve samples from the test set for evaluation.
+    features : list(str)
+        List of distinct features the model predicts.
+    trained_model_path : str
+        Path to the trained model file, saved using `torch.save`.
+    output_dir : str
+        The output directory in which to save model evaluation and logs.
+    batch_size : int, optional
+        Default is 64. Specify the batch size to process examples.
+        Should be a power of 2.
+    n_test_samples : int or None, optional
+        Default is `None`. Use `n_test_samples` if you want to limit the
+        number of samples on which you evaluate your model. If you are
+        using a sampler of type `selene_sdk.samplers.OnlineSampler`,
+        by default it will draw 640000 samples if `n_test_samples` is `None`.
+    report_gt_feature_n_positives : int, optional
+        Default is 10. In the final test set, each class/feature must have
+        more than `report_gt_feature_n_positives` positive samples in order to
+        be considered in the test performance computation. The output file that
+        states each class' performance will report 'NA' for classes that do
+        not have enough positive samples.
+    use_cuda : bool, optional
+        Default is `False`. Specify whether a CUDA-enabled GPU is available
+        for torch to use during training.
+    data_parallel : bool, optional
+        Default is `False`. Specify whether multiple GPUs are available
+        for torch to use during training.
+    use_features_ord : list(str) or None, optional
+        Default is None. Specify an ordered list of features for which to
+        run the evaluation. The features in this list must be identical to or
+        a subset of `features`, and in the order you want the resulting
+        `test_targets.npz` and `test_predictions.npz` to be saved. If using
+        a FileSampler or H5DataLoader for the evaluation, you can pass in
+        a dataset with the targets matrix only containing these features, but
+        note that this subsetted targets matrix MUST be ordered the same
+        way as `features`, and the predictions and targets .npz output
+        will be reordered according to `use_features_ord`.
+
+    Attributes
+    ----------
+    model : torch.nn.Module
+        The trained model.
+    criterion : torch.nn._Loss
+        The model was trained using this loss function.
+    sampler : selene_sdk.samplers.Sampler
+        The example generator.
+    features : list(str)
+        List of distinct features the model predicts.
+    batch_size : int
+        The batch size to process examples. Should be a power of 2.
+    use_cuda : bool
+        If `True`, use a CUDA-enabled GPU. If `False`, use the CPU.
+    data_parallel : bool
+        Whether to use multiple GPUs or not.
+    metrics : dict
+        A dictionary that maps metric names (`str`) to metric functions.
+        By default, this contains `"roc_auc"`, which maps to
+        `sklearn.metrics.roc_auc_score`, and `"average_precision"`,
+        which maps to `sklearn.metrics.average_precision_score`.
+
+    """
+
+    def __init__(self,
+                 model,
+                 data_sampler,
+                 features,
+                 trained_model_path,
+                 output_dir,
+                 batch_size=64,
+                 n_test_samples=None,
+                 report_gt_feature_n_positives=10,
+                 use_cuda=False,
+                 data_parallel=False,
+                 use_features_ord=None,
+                 metrics=dict(roc_auc=roc_auc_score,
+                              average_precision=average_precision_score)):
+        self.reg_criterion = F.mse_loss
+
+        trained_model = torch.load(
+            trained_model_path, map_location=lambda storage, location: storage)
+        if "state_dict" in trained_model:
+            self.model = load_model_from_state_dict(
+                trained_model["state_dict"], model)
+        else:
+            self.model = load_model_from_state_dict(
+                trained_model, model)
+        self.model.eval()
+
+        self.sampler = data_sampler
+
+        self.output_dir = output_dir
+        os.makedirs(output_dir, exist_ok=True)
+
+        self.features = np.array(features)
+        self._use_ixs = list(range(len(features)))
+        if use_features_ord is not None:
+            feature_ixs = {f: ix for (ix, f) in enumerate(features)}
+            self._use_ixs = []
+            for f in use_features_ord:
+                if f in feature_ixs:
+                    self._use_ixs.append(feature_ixs[f])
+                else:
+                    warnings.warn(("Feature {0} in `use_features_ord` "
+                                   "does not match any features in the list "
+                                   "`features` and will be skipped.").format(f))
+            self._write_features_ordered_to_file()
+
+        initialize_logger(
+            os.path.join(self.output_dir, "{0}.log".format(
+                __name__)),
+            verbosity=2)
+
+        self.data_parallel = data_parallel
+        if self.data_parallel:
+            self.model = nn.DataParallel(model)
+            logger.debug("Wrapped model in DataParallel")
+
+        self.use_cuda = use_cuda
+        if self.use_cuda:
+            self.model.cuda()
+
+        self.batch_size = batch_size
+
+        metrics = dict(spearmanr=spearmanr, pearsonr=pearsonr)
+        self._metrics = MethylationPerformanceMetrics(
+            self._get_feature_from_index,
+            report_gt_feature_n_positives=report_gt_feature_n_positives,
+            metrics=metrics)
+
+
self.n_test_samples = n_test_samples + self._test_data, self._all_test_targets = \ + self.sampler.get_data_and_targets(self.batch_size, n_test_samples) + + self._use_testmat_ixs = self._use_ixs[:] + # if the targets shape is the same as the subsetted features, + # reindex based on the subsetted list + if self._all_test_targets.shape[1] == len(self._use_ixs): + subset_features = {self.features[ix]: i for (i, ix) in + enumerate(sorted(self._use_ixs))} + self._use_testmat_ixs = [ + subset_features[f] for f in self.features[self._use_ixs]] + + #self._all_test_targets = self._all_test_targets[ + # :, self._use_testmat_ixs] + + # save the targets dataset now + with h5py.File(os.path.join(self.output_dir, 'test_dataset.N={0}.h5'.format( + self.n_test_samples)), 'a') as fh: + if 'targets' in fh: + del fh['targets'] + fh.create_dataset('targets', data=self._all_test_targets) + + # reset Genome base ordering when applicable. + if (hasattr(self.sampler, "reference_sequence") and + isinstance(self.sampler.reference_sequence, Genome)): + if _is_lua_trained_model(model): + Genome.update_bases_order(['A', 'G', 'C', 'T']) + else: + Genome.update_bases_order(['A', 'C', 'G', 'T']) + + def _write_features_ordered_to_file(self): + """ + Write the feature ordering specified by `use_features_ord` + after matching it with the `features` list from the class + initialization parameters. + """ + fp = os.path.join(self.output_dir, 'use_features_ord.txt') + with open(fp, 'w+') as file_handle: + for f in self.features: #[self._use_ixs]: + file_handle.write('{0}\n'.format(f)) + + def _get_feature_from_index(self, index): + """ + Gets the feature at an index in the features list. + + Parameters + ---------- + index : int + + Returns + ------- + str + The name of the feature/target at the specified index. + + """ + return self.features[index] #[self._use_ixs][index] + + def evaluate(self): + """ + Passes all samples retrieved from the sampler to the model in + batches and returns the predictions. Also reports the model's + performance on these examples. + + Returns + ------- + dict + A dictionary, where keys are the features and the values are + each a dict of the performance metrics (currently ROC AUC and + AUPR) reported for each feature the model predicts. 
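The metrics this method actually computes are the Pearson and Spearman correlations configured in the constructor; the ROC AUC/AUPR wording above appears to be carried over from `EvaluateModel`. The loop below masks out missing (NaN) methylation targets before computing the regression loss; a minimal sketch of that masking step on toy tensors:

    import torch
    import torch.nn.functional as F

    predictions = torch.tensor([0.2, 0.8, 0.5])
    targets = torch.tensor([0.1, float("nan"), 0.6])  # one missing target

    mask = torch.isnan(targets)
    loss = F.mse_loss(predictions[~mask], targets[~mask])
    # mean of (0.2 - 0.1)**2 and (0.5 - 0.6)**2 -> loss == 0.01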
+ + """ + batch_losses = [] + sequences = [] + all_predictions = [] + for (inputs, targets) in self._test_data: + sequences.append(inputs) + inputs = torch.Tensor(inputs).int() + #inputs = torch.Tensor(inputs) + targets = torch.Tensor(targets) #[:, self._use_testmat_ixs]) + if self.use_cuda: + inputs = inputs.cuda() + targets = targets.cuda() + with torch.no_grad(): + predictions = None + if _is_lua_trained_model(self.model): + predictions = self.model.forward( + inputs.transpose(1, 2).contiguous().unsqueeze_(2)) + else: + predictions = self.model.forward( + inputs)[0] + #inputs.transpose(1, 2)) + #print(type(targets), type(predictions)) + #print(targets.size(), predictions.size()) + mask = torch.isnan(targets) + #print(mask) + reg_loss = self.reg_criterion( + predictions[~mask], targets[~mask]) + + all_predictions.append(predictions) + batch_losses.append(reg_loss.item()) + + all_predictions = torch.vstack(all_predictions) + all_predictions = all_predictions.data.cpu().numpy() + + reg_scores = self._metrics.update( + all_predictions, self._all_test_targets, scores=['pearsonr', 'spearmanr']) + for name, score in reg_scores.items(): + logger.info("test {0}: {1}".format(name, score)) + + np.savez_compressed( + os.path.join(self.output_dir, "test_predictions.N={0}.npz".format( + self.n_test_samples)), + data=all_predictions) + + with h5py.File(os.path.join(self.output_dir, 'test_dataset.N={0}.h5'.format( + self.n_test_samples)), 'a') as fh: + fh.create_dataset('sequences', data=np.vstack(sequences)) + + lossdict = { + 'loss': np.average(batch_losses) + } + for name, score in lossdict.items(): + logger.info("test {0}: {1}".format(name, score)) + + test_performance = os.path.join( + self.output_dir, "test_performance.txt") + feature_scores_dict = self._metrics.write_feature_scores_to_file( + test_performance) + + return feature_scores_dict diff --git a/selene_sdk/evaluate_methylation_only.py b/selene_sdk/evaluate_methylation_only.py new file mode 100644 index 00000000..edccfe7d --- /dev/null +++ b/selene_sdk/evaluate_methylation_only.py @@ -0,0 +1,292 @@ +""" +This module provides the EvaluateModel class. +""" +import logging +import os +import warnings + +import h5py +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import average_precision_score, roc_auc_score +from sklearn.metrics import mean_squared_error + +from .sequences import Genome +from .utils import (PerformanceMetrics, _is_lua_trained_model, + initialize_logger, load_model_from_state_dict) +from .utils.methylation_performance_metrics import MethylationPerformanceMetrics + +logger = logging.getLogger("selene") + + +class EvaluateMethylationOnly(object): + """ + Evaluate model on a test set of sequences with known targets. + + Parameters + ---------- + model : torch.nn.Module + The model architecture. + criterion : torch.nn._Loss + The loss function that was optimized during training. + data_sampler : selene_sdk.samplers.Sampler + Used to retrieve samples from the test set for evaluation. + features : list(str) + List of distinct features the model predicts. + trained_model_path : str + Path to the trained model file, saved using `torch.save`. + output_dir : str + The output directory in which to save model evaluation and logs. + batch_size : int, optional + Default is 64. Specify the batch size to process examples. + Should be a power of 2. + n_test_samples : int or None, optional + Default is `None`. 
Use `n_test_samples` if you want to limit the + number of samples on which you evaluate your model. If you are + using a sampler of type `selene_sdk.samplers.OnlineSampler`, + by default it will draw 640000 samples if `n_test_samples` is `None`. + report_gt_feature_n_positives : int, optional + Default is 10. In the final test set, each class/feature must have + more than `report_gt_feature_n_positives` positive samples in order to + be considered in the test performance computation. The output file that + states each class' performance will report 'NA' for classes that do + not have enough positive samples. + use_cuda : bool, optional + Default is `False`. Specify whether a CUDA-enabled GPU is available + for torch to use during training. + data_parallel : bool, optional + Default is `False`. Specify whether multiple GPUs are available + for torch to use during training. + use_features_ord : list(str) or None, optional + Default is None. Specify an ordered list of features for which to + run the evaluation. The features in this list must be identical to or + a subset of `features`, and in the order you want the resulting + `test_targets.npz` and `test_predictions.npz` to be saved. If using + a FileSampler or H5DataLoader for the evaluation, you can pass in + a dataset with the targets matrix only containing these features, but + note that this subsetted targets matrix MUST be ordered the same + way as `features`, and the predictions and targets .npz output + will be reordered according to `use_features_ord`. + + Attributes + ---------- + model : torch.nn.Module + The trained model. + criterion : torch.nn._Loss + The model was trained using this loss function. + sampler : selene_sdk.samplers.Sampler + The example generator. + features : list(str) + List of distinct features the model predicts. + batch_size : int + The batch size to process examples. Should be a power of 2. + use_cuda : bool + If `True`, use a CUDA-enabled GPU. If `False`, use the CPU. + data_parallel : bool + Whether to use multiple GPUs or not. + metrics : dict + A dictionary that maps metric names (`str`) to metric functions. + By default, this contains `"roc_auc"`, which maps to + `sklearn.metrics.roc_auc_score`, and `"average_precision"`, + which maps to `sklearn.metrics.average_precision_score`. 
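In practice the constructor below discards these classification defaults and scores with `scipy.stats.spearmanr` and `pearsonr`, which `MethylationPerformanceMetrics` applies feature by feature. A minimal sketch of that per-feature scoring, assuming missing (NaN) targets are dropped per column (an assumption about the metrics class, whose internals are not shown here):

    import numpy as np
    from scipy.stats import pearsonr, spearmanr

    preds = np.random.rand(100, 3)  # toy predictions: 100 samples, 3 features
    tgts = np.random.rand(100, 3)   # toy targets with one missing entry
    tgts[0, 0] = np.nan

    for j in range(tgts.shape[1]):
        keep = ~np.isnan(tgts[:, j])
        r, _ = pearsonr(preds[keep, j], tgts[keep, j])
        rho, _ = spearmanr(preds[keep, j], tgts[keep, j])
        print(j, round(r, 3), round(rho, 3))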
+ + """ + + def __init__(self, + model, + data_sampler, + features, + trained_model_path, + output_dir, + batch_size=64, + n_test_samples=None, + report_gt_feature_n_positives=10, + use_cuda=False, + data_parallel=False, + use_features_ord=None, + metrics=dict(roc_auc=roc_auc_score, + average_precision=average_precision_score)): + self.reg_criterion = F.mse_loss + + trained_model = torch.load( + trained_model_path, map_location=lambda storage, location: storage) + if "state_dict" in trained_model: + self.model = load_model_from_state_dict( + trained_model["state_dict"], model) + else: + self.model = load_model_from_state_dict( + trained_model, model) + self.model.eval() + + self.sampler = data_sampler + + self.output_dir = output_dir + os.makedirs(output_dir, exist_ok=True) + + self.features = np.array(features) + self._use_ixs = list(range(len(features))) + if use_features_ord is not None: + feature_ixs = {f: ix for (ix, f) in enumerate(features)} + self._use_ixs = [] + for f in use_features_ord: + if f in feature_ixs: + self._use_ixs.append(feature_ixs[f]) + else: + warnings.warn(("Feature {0} in `use_features_ord` " + "does not match any features in the list " + "`features` and will be skipped.").format(f)) + self._write_features_ordered_to_file() + + initialize_logger( + os.path.join(self.output_dir, "{0}.log".format( + __name__)), + verbosity=2) + + self.data_parallel = data_parallel + if self.data_parallel: + self.model = nn.DataParallel(model) + logger.debug("Wrapped model in DataParallel") + + self.use_cuda = use_cuda + if self.use_cuda: + self.model.cuda() + + self.batch_size = batch_size + + metrics = dict(spearmanr=spearmanr, pearsonr=pearsonr) + self._metrics = MethylationPerformanceMetrics( + self._get_feature_from_index, + report_gt_feature_n_positives=report_gt_feature_n_positives, + metrics=metrics) + + self.n_test_samples = n_test_samples + self._test_data, self._all_test_targets = \ + self.sampler.get_data_and_targets(self.batch_size, n_test_samples) + + self._use_testmat_ixs = self._use_ixs[:] + # if the targets shape is the same as the subsetted features, + # reindex based on the subsetted list + if self._all_test_targets.shape[1] == len(self._use_ixs): + subset_features = {self.features[ix]: i for (i, ix) in + enumerate(sorted(self._use_ixs))} + self._use_testmat_ixs = [ + subset_features[f] for f in self.features[self._use_ixs]] + + #self._all_test_targets = self._all_test_targets[ + # :, self._use_testmat_ixs] + + # save the targets dataset now + with h5py.File(os.path.join(self.output_dir, 'test_dataset.N={0}.h5'.format( + self.n_test_samples)), 'a') as fh: + if 'targets' in fh: + del fh['targets'] + fh.create_dataset('targets', data=self._all_test_targets) + + # reset Genome base ordering when applicable. + if (hasattr(self.sampler, "reference_sequence") and + isinstance(self.sampler.reference_sequence, Genome)): + if _is_lua_trained_model(model): + Genome.update_bases_order(['A', 'G', 'C', 'T']) + else: + Genome.update_bases_order(['A', 'C', 'G', 'T']) + + def _write_features_ordered_to_file(self): + """ + Write the feature ordering specified by `use_features_ord` + after matching it with the `features` list from the class + initialization parameters. + """ + fp = os.path.join(self.output_dir, 'use_features_ord.txt') + with open(fp, 'w+') as file_handle: + for f in self.features: #[self._use_ixs]: + file_handle.write('{0}\n'.format(f)) + + def _get_feature_from_index(self, index): + """ + Gets the feature at an index in the features list. 
+ + Parameters + ---------- + index : int + + Returns + ------- + str + The name of the feature/target at the specified index. + + """ + return self.features[index] #[self._use_ixs][index] + + def evaluate(self): + """ + Passes all samples retrieved from the sampler to the model in + batches and returns the predictions. Also reports the model's + performance on these examples. + + Returns + ------- + dict + A dictionary, where keys are the features and the values are + each a dict of the performance metrics (currently ROC AUC and + AUPR) reported for each feature the model predicts. + + """ + batch_losses = [] + sequences = [] + all_predictions = [] + for (inputs, targets) in self._test_data: + sequences.append(inputs) + #inputs = torch.Tensor(inputs).int() + inputs = torch.Tensor(inputs) + targets = torch.Tensor(targets) #[:, self._use_testmat_ixs]) + if self.use_cuda: + inputs = inputs.cuda() + targets = targets.cuda() + with torch.no_grad(): + predictions = None + if _is_lua_trained_model(self.model): + predictions = self.model.forward( + inputs.transpose(1, 2).contiguous().unsqueeze_(2)) + else: + predictions = self.model.forward( + #inputs)[0] + inputs.transpose(1, 2)) + mask = torch.isnan(targets) + reg_loss = self.reg_criterion( + predictions[~mask], targets[~mask]) + + all_predictions.append(predictions) + batch_losses.append(reg_loss.item()) + + all_predictions = torch.vstack(all_predictions) + all_predictions = all_predictions.data.cpu().numpy() + + reg_scores = self._metrics.update( + all_predictions, self._all_test_targets, scores=['pearsonr', 'spearmanr']) + for name, score in reg_scores.items(): + logger.info("test {0}: {1}".format(name, score)) + + np.savez_compressed( + os.path.join(self.output_dir, "test_predictions.N={0}.npz".format( + self.n_test_samples)), + data=all_predictions) + + with h5py.File(os.path.join(self.output_dir, 'test_dataset.N={0}.h5'.format( + self.n_test_samples)), 'a') as fh: + fh.create_dataset('sequences', data=np.vstack(sequences)) + + lossdict = { + 'loss': np.average(batch_losses) + } + for name, score in lossdict.items(): + logger.info("test {0}: {1}".format(name, score)) + + test_performance = os.path.join( + self.output_dir, "test_performance.txt") + feature_scores_dict = self._metrics.write_feature_scores_to_file( + test_performance) + + return feature_scores_dict From c761f36c1a23faba5d8e2ec5456c5ce57c695689 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 13 Sep 2023 09:38:37 -0400 Subject: [PATCH 26/62] revert non strand specific module --- selene_sdk/utils/non_strand_specific_module.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/selene_sdk/utils/non_strand_specific_module.py b/selene_sdk/utils/non_strand_specific_module.py index 73466ef0..dd937f9a 100644 --- a/selene_sdk/utils/non_strand_specific_module.py +++ b/selene_sdk/utils/non_strand_specific_module.py @@ -134,8 +134,8 @@ def forward(self, input): return ((output1 + output_from_rev1) / 2, (output2 + output_from_rev2) / 2) else: if self.mode == "mean": - return (output + output_from_rev) / 2 + return ((output + output_from_rev) / 2,) else: - return torch.max(output, output_from_rev) + return (torch.max(output, output_from_rev),) From 1ca2b3787913e4dfabb9cfdec0894671642efea2 Mon Sep 17 00:00:00 2001 From: Kathy Date: Thu, 14 Sep 2023 17:19:00 -0400 Subject: [PATCH 27/62] trying to figure out unet dataloader changes --- selene_sdk/samplers/dataloader.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git 
a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index 2b2bd7f2..94e04c53 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -188,7 +188,8 @@ def __init__(self, sequence_key="sequences", targets_key="targets", indicators_key=False, - use_seq_len=None): + use_seq_len=None, + shift=False): super(_H5Dataset, self).__init__() self.file_path = file_path self.in_memory = in_memory @@ -198,6 +199,7 @@ def __init__(self, self.unpackbits_tgt = unpackbits_tgt self.use_seq_len = use_seq_len + self.shift = shift self._seq_start, self._seq_end = None, None self._initialized = False @@ -262,11 +264,19 @@ def __getitem__(self, index): mid = (self._seq_end - self._seq_start) // 2 self._seq_start = mid - self.use_seq_len // 2 self._seq_end = int(mid + np.ceil(self.use_seq_len / 2)) + if self.shift: + diff = (len(sequence) - self.use_seq_len) // 2 + direction = np.random.choice([-1, 1]) + shift = np.random.choice(np.arange(diff+1)) + print(direction, shift) + self._seq_start = self._seq_start + direction * shift + self._seq_end = self._seq_end + direction * shift + print(self._seq_start, self._seq_end) sequence = sequence[self._seq_start:self._seq_end] - #if np.random.randint(2) == 1: # ONLY for unet - # sequence = np.flip(sequence, axis=-1) - # targets = np.flip(targets, axis=-1) + if np.random.randint(2) == 1: # ONLY for unet + sequence = np.flip(sequence, axis=-1) + targets = np.flip(targets, axis=-1) if self.indicators is not None: return (torch.from_numpy(sequence.astype(np.float32)), torch.from_numpy(targets.astype(np.float32)), From 5591680aaf055bf1948ffd5c350a27e33e2e3561 Mon Sep 17 00:00:00 2001 From: Kathy Date: Thu, 14 Sep 2023 17:23:35 -0400 Subject: [PATCH 28/62] trying to figure out unet dataloader changes - add tgt shift --- selene_sdk/samplers/dataloader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index 94e04c53..b9a68b8f 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -274,9 +274,12 @@ def __getitem__(self, index): print(self._seq_start, self._seq_end) sequence = sequence[self._seq_start:self._seq_end] - if np.random.randint(2) == 1: # ONLY for unet + # UNET ONLY # + targets = targets[:, self._seq_start:self._seq_end] + if np.random.randint(2) == 1: sequence = np.flip(sequence, axis=-1) targets = np.flip(targets, axis=-1) + # UNET ONLY # if self.indicators is not None: return (torch.from_numpy(sequence.astype(np.float32)), torch.from_numpy(targets.astype(np.float32)), From 3fa4bb0a69b52365dff7cafce1e6cf44ad305714 Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 18 Sep 2023 08:48:36 -0400 Subject: [PATCH 29/62] remove print debug statements --- selene_sdk/samplers/dataloader.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index b9a68b8f..40986304 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -268,10 +268,8 @@ def __getitem__(self, index): diff = (len(sequence) - self.use_seq_len) // 2 direction = np.random.choice([-1, 1]) shift = np.random.choice(np.arange(diff+1)) - print(direction, shift) self._seq_start = self._seq_start + direction * shift self._seq_end = self._seq_end + direction * shift - print(self._seq_start, self._seq_end) sequence = sequence[self._seq_start:self._seq_end] # UNET ONLY # From 79af76963b57ed821d5b5db37335b1aeae3b921f Mon Sep 17 00:00:00 2001 From: 
Kathy Date: Tue, 19 Sep 2023 10:53:07 -0400 Subject: [PATCH 30/62] addr memory issue in eval --- selene_sdk/evaluate_methylation_only.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/selene_sdk/evaluate_methylation_only.py b/selene_sdk/evaluate_methylation_only.py index edccfe7d..6525e6d8 100644 --- a/selene_sdk/evaluate_methylation_only.py +++ b/selene_sdk/evaluate_methylation_only.py @@ -258,11 +258,10 @@ def evaluate(self): reg_loss = self.reg_criterion( predictions[~mask], targets[~mask]) - all_predictions.append(predictions) + all_predictions.append(predictions.data.cpu().numpy()) batch_losses.append(reg_loss.item()) - all_predictions = torch.vstack(all_predictions) - all_predictions = all_predictions.data.cpu().numpy() + all_predictions = np.vstack(all_predictions) reg_scores = self._metrics.update( all_predictions, self._all_test_targets, scores=['pearsonr', 'spearmanr']) From e0d0383709397eed6d7c4458a6f49a1a1e6f05c0 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 20 Sep 2023 13:43:15 -0400 Subject: [PATCH 31/62] comment --- selene_sdk/samplers/dataloader.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index 40986304..8d194132 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -273,10 +273,10 @@ def __getitem__(self, index): sequence = sequence[self._seq_start:self._seq_end] # UNET ONLY # - targets = targets[:, self._seq_start:self._seq_end] - if np.random.randint(2) == 1: - sequence = np.flip(sequence, axis=-1) - targets = np.flip(targets, axis=-1) + #targets = targets[:, self._seq_start:self._seq_end] + #if np.random.randint(2) == 1: + # sequence = np.flip(sequence, axis=-1) + # targets = np.flip(targets, axis=-1) # UNET ONLY # if self.indicators is not None: return (torch.from_numpy(sequence.astype(np.float32)), From 4ce9c994239555918db8a965177a01a47a3856b5 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 18 Oct 2023 12:10:27 -0400 Subject: [PATCH 32/62] troubleshooting dataloader shift --- selene_sdk/samplers/dataloader.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index 8d194132..b9497a26 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -189,7 +189,7 @@ def __init__(self, targets_key="targets", indicators_key=False, use_seq_len=None, - shift=False): + shift=None): super(_H5Dataset, self).__init__() self.file_path = file_path self.in_memory = in_memory @@ -262,14 +262,19 @@ def __getitem__(self, index): if self.use_seq_len is not None: mid = (self._seq_end - self._seq_start) // 2 + mid = mid - 1 # specific to methylation + # adding some sort of shift, debug this?? 
self._seq_start = mid - self.use_seq_len // 2 self._seq_end = int(mid + np.ceil(self.use_seq_len / 2)) - if self.shift: - diff = (len(sequence) - self.use_seq_len) // 2 - direction = np.random.choice([-1, 1]) - shift = np.random.choice(np.arange(diff+1)) - self._seq_start = self._seq_start + direction * shift - self._seq_end = self._seq_end + direction * shift + if self.shift is not None: + self._seq_start += self.shift + self._seq_end += self.shift + # randomness for UNET + #diff = (len(sequence) - self.use_seq_len) // 2 + #direction = np.random.choice([-1, 1]) + #shift = np.random.choice(np.arange(diff+1)) + #self._seq_start = self._seq_start + direction * shift + #self._seq_end = self._seq_end + direction * shift sequence = sequence[self._seq_start:self._seq_end] # UNET ONLY # @@ -401,5 +406,6 @@ def worker_init_fn(worker_id): args["sampler"] = SubsetRandomSampler(use_subset) else: args["shuffle"] = shuffle + super(H5DataLoader, self).__init__(dataset, **args) From 3229569d6ebaa1a427ea41373ce49a3b675e7d42 Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 20 Oct 2023 09:27:18 -0400 Subject: [PATCH 33/62] add strand arg to _retrieve --- selene_sdk/samplers/methylation_data_sampler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/selene_sdk/samplers/methylation_data_sampler.py b/selene_sdk/samplers/methylation_data_sampler.py index 1883a5c2..66c6f1c6 100644 --- a/selene_sdk/samplers/methylation_data_sampler.py +++ b/selene_sdk/samplers/methylation_data_sampler.py @@ -241,7 +241,7 @@ def _partition_genome_by_chromosome(self): self._sample_from_mode[mode]._replace( indices=indices, weights=weights) - def _retrieve(self, chrom, position): + def _retrieve(self, chrom, position, strand=None): bin_start = position - self._start_radius bin_end = position + self._end_radius #indicator = self.target.is_positive( @@ -255,7 +255,8 @@ def _retrieve(self, chrom, position): self._start_radius, self._end_radius, self._start_window_radius, self._end_window_radius,) return None - strand = self.STRAND_SIDES[random.randint(0, 1)] + if strand is None: + strand = self.STRAND_SIDES[random.randint(0, 1)] retrieved_seq = \ self.reference_sequence.get_encoding_from_coords( chrom, window_start, window_end, strand) From d09da28069916900432c009b3c31fff3d68026dd Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 20 Oct 2023 11:29:23 -0400 Subject: [PATCH 34/62] shift testing --- selene_sdk/samplers/dataloader.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index b9497a26..8be04df0 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -261,11 +261,11 @@ def __getitem__(self, index): self._seq_end = len(sequence) if self.use_seq_len is not None: - mid = (self._seq_end - self._seq_start) // 2 - mid = mid - 1 # specific to methylation - # adding some sort of shift, debug this?? 
- self._seq_start = mid - self.use_seq_len // 2 - self._seq_end = int(mid + np.ceil(self.use_seq_len / 2)) + mid = len(sequence) // 2 + self._seq_start = int(mid - np.ceil(self.use_seq_len / 2)) + self._seq_end = mid + self.use_seq_len // 2 + #- self.use_seq_len // 2 + #self._seq_end = int(mid + np.ceil(self.use_seq_len / 2)) if self.shift is not None: self._seq_start += self.shift self._seq_end += self.shift From 4f82951121dd2c97554f6636558f147df38517c1 Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 7 Nov 2023 11:44:12 -0500 Subject: [PATCH 35/62] adjust casting of targets in dataloader --- selene_sdk/samplers/dataloader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index 8be04df0..0a77ce17 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -288,8 +288,9 @@ def __getitem__(self, index): torch.from_numpy(targets.astype(np.float32)), self.indicators[index]) else: - return (torch.from_numpy(sequence.astype(np.float32)), - torch.from_numpy(targets.astype(np.float32)),) + s = sequence.astype(np.float32) + #t = targets.astype(np.float32) + return (torch.from_numpy(s), torch.from_numpy(targets)) @init From cd390ddddd7d6fe6e004ace6d31246bb2c509643 Mon Sep 17 00:00:00 2001 From: Kathy Date: Sat, 27 Jan 2024 11:05:16 -0500 Subject: [PATCH 36/62] minor changes to eval and sampling --- selene_sdk/evaluate_methylation_only.py | 2 ++ selene_sdk/samplers/random_positions_sampler.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/selene_sdk/evaluate_methylation_only.py b/selene_sdk/evaluate_methylation_only.py index 6525e6d8..cdb850b8 100644 --- a/selene_sdk/evaluate_methylation_only.py +++ b/selene_sdk/evaluate_methylation_only.py @@ -275,6 +275,8 @@ def evaluate(self): with h5py.File(os.path.join(self.output_dir, 'test_dataset.N={0}.h5'.format( self.n_test_samples)), 'a') as fh: + if 'sequences' in fh: + del fh['sequences'] fh.create_dataset('sequences', data=np.vstack(sequences)) lossdict = { diff --git a/selene_sdk/samplers/random_positions_sampler.py b/selene_sdk/samplers/random_positions_sampler.py index fe2bf677..2dc8f493 100644 --- a/selene_sdk/samplers/random_positions_sampler.py +++ b/selene_sdk/samplers/random_positions_sampler.py @@ -236,7 +236,7 @@ def _partition_genome_by_chromosome(self): self._sample_from_mode[mode]._replace( indices=indices, weights=weights) - def _retrieve(self, chrom, position): + def _retrieve(self, chrom, position, strand=None): bin_start = position - self._start_radius bin_end = position + self._end_radius retrieved_targets = self.target.get_feature_data( @@ -248,7 +248,8 @@ def _retrieve(self, chrom, position): self._start_radius, self._end_radius, self._start_window_radius, self._end_window_radius,) return None - strand = self.STRAND_SIDES[random.randint(0, 1)] + if strand is None: + strand = self.STRAND_SIDES[random.randint(0, 1)] retrieved_seq = \ self.reference_sequence.get_encoding_from_coords( chrom, window_start, window_end, strand) From 240f406367ff52a1aafede99c28261cc6317a0b5 Mon Sep 17 00:00:00 2001 From: Kathy Date: Thu, 7 Mar 2024 16:10:38 -0500 Subject: [PATCH 37/62] remove unused code in nonstrandspecific module --- selene_sdk/utils/non_strand_specific_module.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/selene_sdk/utils/non_strand_specific_module.py b/selene_sdk/utils/non_strand_specific_module.py index dd937f9a..7565a607 100644 --- 
a/selene_sdk/utils/non_strand_specific_module.py +++ b/selene_sdk/utils/non_strand_specific_module.py @@ -67,8 +67,6 @@ def forward(self, input): else: reverse_input = _flip(_flip(input, 1), 2) - #output1, output2 = self.model.forward(input) - #output_from_rev1, output_from_rev2 = self.model.forward(reverse_input) output = self.model.forward(input) output_from_rev = self.model.forward(reverse_input) if type(output) == tuple: @@ -80,11 +78,6 @@ def forward(self, input): return (output + output_from_rev) / 2 else: return torch.max(output, output_from_rev) - #if self.mode == "mean": - # return ((output1 + output_from_rev1) / 2, (output2 + output_from_rev2) / 2) - #else: - # return torch.max(output1, output_from_rev1), torch.max(output2, output_from_rev2) - From b7f9968a226cbc1f5670d1a47441a8cb73c20d2c Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 15 Mar 2024 15:39:06 -0400 Subject: [PATCH 38/62] remove indicators from dataloader, clean up methylation performance metrics --- selene_sdk/samplers/dataloader.py | 33 +++-------------- .../utils/methylation_performance_metrics.py | 37 +------------------ 2 files changed, 8 insertions(+), 62 deletions(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index 0a77ce17..a3ccf2e2 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -56,12 +56,12 @@ def __getitem__(self, index): The shape of `targets` will be :math:`I \\times T`, where :math:`T` is the number of targets predicted. """ - sequences, targets, inds = self.sampler.sample( + sequences, targets = self.sampler.sample( batch_size=1 if isinstance(index, int) else len(index)) if sequences.shape[0] == 1: sequences = sequences[0,:] targets = targets[0,:] - return sequences, targets, inds + return sequences, targets def __len__(self): """ @@ -187,7 +187,6 @@ def __init__(self, unpackbits_tgt=False, sequence_key="sequences", targets_key="targets", - indicators_key=False, use_seq_len=None, shift=None): super(_H5Dataset, self).__init__() @@ -205,7 +204,6 @@ def __init__(self, self._initialized = False self._sequence_key = sequence_key self._targets_key = targets_key - self._indicators_key = indicators_key def init(func): # delay initialization to allow multiprocessing @@ -213,9 +211,6 @@ def init(func): def dfunc(self, *args, **kwargs): if not self._initialized: self.db = h5py.File(self.file_path, 'r') - key = 'indicator' - if key not in self.db and self._indicators_key: - key = 'indicators' if self.unpackbits: self.s_len = self.db['{0}_length'.format(self._sequence_key)][()] @@ -228,15 +223,10 @@ def dfunc(self, *args, **kwargs): if self.in_memory: self.sequences = np.asarray(self.db[self._sequence_key]) self.targets = np.asarray(self.db[self._targets_key]) - self.indicators = None - if self._indicators_key: - self.indicators = np.asarray(self.db[key]) else: self.sequences = self.db[self._sequence_key] self.targets = self.db[self._targets_key] - self.indicators = None - if self._indicators_key: - self.indicators = self.db[key] + self._initialized = True return func(self, *args, **kwargs) return dfunc @@ -277,20 +267,9 @@ def __getitem__(self, index): #self._seq_end = self._seq_end + direction * shift sequence = sequence[self._seq_start:self._seq_end] - # UNET ONLY # - #targets = targets[:, self._seq_start:self._seq_end] - #if np.random.randint(2) == 1: - # sequence = np.flip(sequence, axis=-1) - # targets = np.flip(targets, axis=-1) - # UNET ONLY # - if self.indicators is not None: - return 
(torch.from_numpy(sequence.astype(np.float32)), - torch.from_numpy(targets.astype(np.float32)), - self.indicators[index]) - else: - s = sequence.astype(np.float32) - #t = targets.astype(np.float32) - return (torch.from_numpy(s), torch.from_numpy(targets)) + s = sequence.astype(np.float32) + #t = targets.astype(np.float32) + return (torch.from_numpy(s), torch.from_numpy(targets)) @init diff --git a/selene_sdk/utils/methylation_performance_metrics.py b/selene_sdk/utils/methylation_performance_metrics.py index 0142fee1..0a8ea114 100644 --- a/selene_sdk/utils/methylation_performance_metrics.py +++ b/selene_sdk/utils/methylation_performance_metrics.py @@ -7,10 +7,6 @@ import os import numpy as np -from sklearn.metrics import average_precision_score -from sklearn.metrics import precision_recall_curve -from sklearn.metrics import roc_auc_score -from sklearn.metrics import roc_curve from scipy.stats import rankdata @@ -124,34 +120,6 @@ def get_feature_specific_scores(data, get_feature_from_index_fn): return feature_score_dict -def auc_u_test(labels, predictions): - """ - Outputs the area under the the ROC curve associated with a certain - set of labels and the predictions given by the training model. - Computed from the U statistic. - - Parameters - ---------- - labels: numpy.ndarray - Known labels of values predicted by model. Must be one dimensional. - predictions: numpy.ndarray - Value predicted by user model. Must be one dimensional, with matching - dimension to `labels` - - Returns - ------- - float - AUC value of given label, prediction pairs - - """ - len_pos = int(np.sum(labels)) - len_neg = len(labels) - len_pos - rank_sum = np.sum(rankdata(predictions)[labels == 1]) - u_value = rank_sum - (len_pos * (len_pos + 1)) / 2 - auc = u_value / (len_pos * len_neg) - return auc - - class MethylationPerformanceMetrics(object): """ Tracks and calculates metrics to evaluate how closely a model's @@ -191,9 +159,8 @@ class MethylationPerformanceMetrics(object): def __init__(self, get_feature_from_index_fn, - report_gt_feature_n_positives=10, - metrics=dict(roc_auc=roc_auc_score, - average_precision=average_precision_score)): + metrics, + report_gt_feature_n_positives=10): """ Creates a new object of the `PerformanceMetrics` class. """ From eebe14b07119cf9fef8d2b2c84afe9c6c0332ce9 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 1 May 2024 15:18:54 -0400 Subject: [PATCH 39/62] remove unused files from version control, adjust multi sampler get dataset batches function --- selene_sdk/evaluate_methylation_interact.py | 295 -------------------- selene_sdk/evaluate_methylation_only.py | 293 ------------------- selene_sdk/samplers/multi_sampler.py | 24 +- 3 files changed, 3 insertions(+), 609 deletions(-) delete mode 100644 selene_sdk/evaluate_methylation_interact.py delete mode 100644 selene_sdk/evaluate_methylation_only.py diff --git a/selene_sdk/evaluate_methylation_interact.py b/selene_sdk/evaluate_methylation_interact.py deleted file mode 100644 index 60722326..00000000 --- a/selene_sdk/evaluate_methylation_interact.py +++ /dev/null @@ -1,295 +0,0 @@ -""" -This module provides the EvaluateModel class. 
-""" -import logging -import os -import warnings - -import h5py -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import average_precision_score, roc_auc_score -from sklearn.metrics import mean_squared_error - -from .sequences import Genome -from .utils import (PerformanceMetrics, _is_lua_trained_model, - initialize_logger, load_model_from_state_dict) -from .utils.methylation_performance_metrics import MethylationPerformanceMetrics - -logger = logging.getLogger("selene") - - -class EvaluateMethylationOnly(object): - """ - Evaluate model on a test set of sequences with known targets. - - Parameters - ---------- - model : torch.nn.Module - The model architecture. - criterion : torch.nn._Loss - The loss function that was optimized during training. - data_sampler : selene_sdk.samplers.Sampler - Used to retrieve samples from the test set for evaluation. - features : list(str) - List of distinct features the model predicts. - trained_model_path : str - Path to the trained model file, saved using `torch.save`. - output_dir : str - The output directory in which to save model evaluation and logs. - batch_size : int, optional - Default is 64. Specify the batch size to process examples. - Should be a power of 2. - n_test_samples : int or None, optional - Default is `None`. Use `n_test_samples` if you want to limit the - number of samples on which you evaluate your model. If you are - using a sampler of type `selene_sdk.samplers.OnlineSampler`, - by default it will draw 640000 samples if `n_test_samples` is `None`. - report_gt_feature_n_positives : int, optional - Default is 10. In the final test set, each class/feature must have - more than `report_gt_feature_n_positives` positive samples in order to - be considered in the test performance computation. The output file that - states each class' performance will report 'NA' for classes that do - not have enough positive samples. - use_cuda : bool, optional - Default is `False`. Specify whether a CUDA-enabled GPU is available - for torch to use during training. - data_parallel : bool, optional - Default is `False`. Specify whether multiple GPUs are available - for torch to use during training. - use_features_ord : list(str) or None, optional - Default is None. Specify an ordered list of features for which to - run the evaluation. The features in this list must be identical to or - a subset of `features`, and in the order you want the resulting - `test_targets.npz` and `test_predictions.npz` to be saved. If using - a FileSampler or H5DataLoader for the evaluation, you can pass in - a dataset with the targets matrix only containing these features, but - note that this subsetted targets matrix MUST be ordered the same - way as `features`, and the predictions and targets .npz output - will be reordered according to `use_features_ord`. - - Attributes - ---------- - model : torch.nn.Module - The trained model. - criterion : torch.nn._Loss - The model was trained using this loss function. - sampler : selene_sdk.samplers.Sampler - The example generator. - features : list(str) - List of distinct features the model predicts. - batch_size : int - The batch size to process examples. Should be a power of 2. - use_cuda : bool - If `True`, use a CUDA-enabled GPU. If `False`, use the CPU. - data_parallel : bool - Whether to use multiple GPUs or not. - metrics : dict - A dictionary that maps metric names (`str`) to metric functions. 
- By default, this contains `"roc_auc"`, which maps to - `sklearn.metrics.roc_auc_score`, and `"average_precision"`, - which maps to `sklearn.metrics.average_precision_score`. - - """ - - def __init__(self, - model, - data_sampler, - features, - trained_model_path, - output_dir, - batch_size=64, - n_test_samples=None, - report_gt_feature_n_positives=10, - use_cuda=False, - data_parallel=False, - use_features_ord=None, - metrics=dict(roc_auc=roc_auc_score, - average_precision=average_precision_score)): - self.reg_criterion = F.mse_loss - - trained_model = torch.load( - trained_model_path, map_location=lambda storage, location: storage) - if "state_dict" in trained_model: - self.model = load_model_from_state_dict( - trained_model["state_dict"], model) - else: - self.model = load_model_from_state_dict( - trained_model, model) - self.model.eval() - - self.sampler = data_sampler - - self.output_dir = output_dir - os.makedirs(output_dir, exist_ok=True) - - self.features = np.array(features) - self._use_ixs = list(range(len(features))) - if use_features_ord is not None: - feature_ixs = {f: ix for (ix, f) in enumerate(features)} - self._use_ixs = [] - for f in use_features_ord: - if f in feature_ixs: - self._use_ixs.append(feature_ixs[f]) - else: - warnings.warn(("Feature {0} in `use_features_ord` " - "does not match any features in the list " - "`features` and will be skipped.").format(f)) - self._write_features_ordered_to_file() - - initialize_logger( - os.path.join(self.output_dir, "{0}.log".format( - __name__)), - verbosity=2) - - self.data_parallel = data_parallel - if self.data_parallel: - self.model = nn.DataParallel(model) - logger.debug("Wrapped model in DataParallel") - - self.use_cuda = use_cuda - if self.use_cuda: - self.model.cuda() - - self.batch_size = batch_size - - metrics = dict(spearmanr=spearmanr, pearsonr=pearsonr) - self._metrics = MethylationPerformanceMetrics( - self._get_feature_from_index, - report_gt_feature_n_positives=report_gt_feature_n_positives, - metrics=metrics) - - self.n_test_samples = n_test_samples - self._test_data, self._all_test_targets = \ - self.sampler.get_data_and_targets(self.batch_size, n_test_samples) - - self._use_testmat_ixs = self._use_ixs[:] - # if the targets shape is the same as the subsetted features, - # reindex based on the subsetted list - if self._all_test_targets.shape[1] == len(self._use_ixs): - subset_features = {self.features[ix]: i for (i, ix) in - enumerate(sorted(self._use_ixs))} - self._use_testmat_ixs = [ - subset_features[f] for f in self.features[self._use_ixs]] - - #self._all_test_targets = self._all_test_targets[ - # :, self._use_testmat_ixs] - - # save the targets dataset now - with h5py.File(os.path.join(self.output_dir, 'test_dataset.N={0}.h5'.format( - self.n_test_samples)), 'a') as fh: - if 'targets' in fh: - del fh['targets'] - fh.create_dataset('targets', data=self._all_test_targets) - - # reset Genome base ordering when applicable. - if (hasattr(self.sampler, "reference_sequence") and - isinstance(self.sampler.reference_sequence, Genome)): - if _is_lua_trained_model(model): - Genome.update_bases_order(['A', 'G', 'C', 'T']) - else: - Genome.update_bases_order(['A', 'C', 'G', 'T']) - - def _write_features_ordered_to_file(self): - """ - Write the feature ordering specified by `use_features_ord` - after matching it with the `features` list from the class - initialization parameters. 
- """ - fp = os.path.join(self.output_dir, 'use_features_ord.txt') - with open(fp, 'w+') as file_handle: - for f in self.features: #[self._use_ixs]: - file_handle.write('{0}\n'.format(f)) - - def _get_feature_from_index(self, index): - """ - Gets the feature at an index in the features list. - - Parameters - ---------- - index : int - - Returns - ------- - str - The name of the feature/target at the specified index. - - """ - return self.features[index] #[self._use_ixs][index] - - def evaluate(self): - """ - Passes all samples retrieved from the sampler to the model in - batches and returns the predictions. Also reports the model's - performance on these examples. - - Returns - ------- - dict - A dictionary, where keys are the features and the values are - each a dict of the performance metrics (currently ROC AUC and - AUPR) reported for each feature the model predicts. - - """ - batch_losses = [] - sequences = [] - all_predictions = [] - for (inputs, targets) in self._test_data: - sequences.append(inputs) - inputs = torch.Tensor(inputs).int() - #inputs = torch.Tensor(inputs) - targets = torch.Tensor(targets) #[:, self._use_testmat_ixs]) - if self.use_cuda: - inputs = inputs.cuda() - targets = targets.cuda() - with torch.no_grad(): - predictions = None - if _is_lua_trained_model(self.model): - predictions = self.model.forward( - inputs.transpose(1, 2).contiguous().unsqueeze_(2)) - else: - predictions = self.model.forward( - inputs)[0] - #inputs.transpose(1, 2)) - #print(type(targets), type(predictions)) - #print(targets.size(), predictions.size()) - mask = torch.isnan(targets) - #print(mask) - reg_loss = self.reg_criterion( - predictions[~mask], targets[~mask]) - - all_predictions.append(predictions) - batch_losses.append(reg_loss.item()) - - all_predictions = torch.vstack(all_predictions) - all_predictions = all_predictions.data.cpu().numpy() - - reg_scores = self._metrics.update( - all_predictions, self._all_test_targets, scores=['pearsonr', 'spearmanr']) - for name, score in reg_scores.items(): - logger.info("test {0}: {1}".format(name, score)) - - np.savez_compressed( - os.path.join(self.output_dir, "test_predictions.N={0}.npz".format( - self.n_test_samples)), - data=all_predictions) - - with h5py.File(os.path.join(self.output_dir, 'test_dataset.N={0}.h5'.format( - self.n_test_samples)), 'a') as fh: - fh.create_dataset('sequences', data=np.vstack(sequences)) - - lossdict = { - 'loss': np.average(batch_losses) - } - for name, score in lossdict.items(): - logger.info("test {0}: {1}".format(name, score)) - - test_performance = os.path.join( - self.output_dir, "test_performance.txt") - feature_scores_dict = self._metrics.write_feature_scores_to_file( - test_performance) - - return feature_scores_dict diff --git a/selene_sdk/evaluate_methylation_only.py b/selene_sdk/evaluate_methylation_only.py deleted file mode 100644 index cdb850b8..00000000 --- a/selene_sdk/evaluate_methylation_only.py +++ /dev/null @@ -1,293 +0,0 @@ -""" -This module provides the EvaluateModel class. 
-""" -import logging -import os -import warnings - -import h5py -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from scipy.stats import pearsonr, spearmanr -from sklearn.metrics import average_precision_score, roc_auc_score -from sklearn.metrics import mean_squared_error - -from .sequences import Genome -from .utils import (PerformanceMetrics, _is_lua_trained_model, - initialize_logger, load_model_from_state_dict) -from .utils.methylation_performance_metrics import MethylationPerformanceMetrics - -logger = logging.getLogger("selene") - - -class EvaluateMethylationOnly(object): - """ - Evaluate model on a test set of sequences with known targets. - - Parameters - ---------- - model : torch.nn.Module - The model architecture. - criterion : torch.nn._Loss - The loss function that was optimized during training. - data_sampler : selene_sdk.samplers.Sampler - Used to retrieve samples from the test set for evaluation. - features : list(str) - List of distinct features the model predicts. - trained_model_path : str - Path to the trained model file, saved using `torch.save`. - output_dir : str - The output directory in which to save model evaluation and logs. - batch_size : int, optional - Default is 64. Specify the batch size to process examples. - Should be a power of 2. - n_test_samples : int or None, optional - Default is `None`. Use `n_test_samples` if you want to limit the - number of samples on which you evaluate your model. If you are - using a sampler of type `selene_sdk.samplers.OnlineSampler`, - by default it will draw 640000 samples if `n_test_samples` is `None`. - report_gt_feature_n_positives : int, optional - Default is 10. In the final test set, each class/feature must have - more than `report_gt_feature_n_positives` positive samples in order to - be considered in the test performance computation. The output file that - states each class' performance will report 'NA' for classes that do - not have enough positive samples. - use_cuda : bool, optional - Default is `False`. Specify whether a CUDA-enabled GPU is available - for torch to use during training. - data_parallel : bool, optional - Default is `False`. Specify whether multiple GPUs are available - for torch to use during training. - use_features_ord : list(str) or None, optional - Default is None. Specify an ordered list of features for which to - run the evaluation. The features in this list must be identical to or - a subset of `features`, and in the order you want the resulting - `test_targets.npz` and `test_predictions.npz` to be saved. If using - a FileSampler or H5DataLoader for the evaluation, you can pass in - a dataset with the targets matrix only containing these features, but - note that this subsetted targets matrix MUST be ordered the same - way as `features`, and the predictions and targets .npz output - will be reordered according to `use_features_ord`. - - Attributes - ---------- - model : torch.nn.Module - The trained model. - criterion : torch.nn._Loss - The model was trained using this loss function. - sampler : selene_sdk.samplers.Sampler - The example generator. - features : list(str) - List of distinct features the model predicts. - batch_size : int - The batch size to process examples. Should be a power of 2. - use_cuda : bool - If `True`, use a CUDA-enabled GPU. If `False`, use the CPU. - data_parallel : bool - Whether to use multiple GPUs or not. - metrics : dict - A dictionary that maps metric names (`str`) to metric functions. 
- By default, this contains `"roc_auc"`, which maps to - `sklearn.metrics.roc_auc_score`, and `"average_precision"`, - which maps to `sklearn.metrics.average_precision_score`. - - """ - - def __init__(self, - model, - data_sampler, - features, - trained_model_path, - output_dir, - batch_size=64, - n_test_samples=None, - report_gt_feature_n_positives=10, - use_cuda=False, - data_parallel=False, - use_features_ord=None, - metrics=dict(roc_auc=roc_auc_score, - average_precision=average_precision_score)): - self.reg_criterion = F.mse_loss - - trained_model = torch.load( - trained_model_path, map_location=lambda storage, location: storage) - if "state_dict" in trained_model: - self.model = load_model_from_state_dict( - trained_model["state_dict"], model) - else: - self.model = load_model_from_state_dict( - trained_model, model) - self.model.eval() - - self.sampler = data_sampler - - self.output_dir = output_dir - os.makedirs(output_dir, exist_ok=True) - - self.features = np.array(features) - self._use_ixs = list(range(len(features))) - if use_features_ord is not None: - feature_ixs = {f: ix for (ix, f) in enumerate(features)} - self._use_ixs = [] - for f in use_features_ord: - if f in feature_ixs: - self._use_ixs.append(feature_ixs[f]) - else: - warnings.warn(("Feature {0} in `use_features_ord` " - "does not match any features in the list " - "`features` and will be skipped.").format(f)) - self._write_features_ordered_to_file() - - initialize_logger( - os.path.join(self.output_dir, "{0}.log".format( - __name__)), - verbosity=2) - - self.data_parallel = data_parallel - if self.data_parallel: - self.model = nn.DataParallel(model) - logger.debug("Wrapped model in DataParallel") - - self.use_cuda = use_cuda - if self.use_cuda: - self.model.cuda() - - self.batch_size = batch_size - - metrics = dict(spearmanr=spearmanr, pearsonr=pearsonr) - self._metrics = MethylationPerformanceMetrics( - self._get_feature_from_index, - report_gt_feature_n_positives=report_gt_feature_n_positives, - metrics=metrics) - - self.n_test_samples = n_test_samples - self._test_data, self._all_test_targets = \ - self.sampler.get_data_and_targets(self.batch_size, n_test_samples) - - self._use_testmat_ixs = self._use_ixs[:] - # if the targets shape is the same as the subsetted features, - # reindex based on the subsetted list - if self._all_test_targets.shape[1] == len(self._use_ixs): - subset_features = {self.features[ix]: i for (i, ix) in - enumerate(sorted(self._use_ixs))} - self._use_testmat_ixs = [ - subset_features[f] for f in self.features[self._use_ixs]] - - #self._all_test_targets = self._all_test_targets[ - # :, self._use_testmat_ixs] - - # save the targets dataset now - with h5py.File(os.path.join(self.output_dir, 'test_dataset.N={0}.h5'.format( - self.n_test_samples)), 'a') as fh: - if 'targets' in fh: - del fh['targets'] - fh.create_dataset('targets', data=self._all_test_targets) - - # reset Genome base ordering when applicable. - if (hasattr(self.sampler, "reference_sequence") and - isinstance(self.sampler.reference_sequence, Genome)): - if _is_lua_trained_model(model): - Genome.update_bases_order(['A', 'G', 'C', 'T']) - else: - Genome.update_bases_order(['A', 'C', 'G', 'T']) - - def _write_features_ordered_to_file(self): - """ - Write the feature ordering specified by `use_features_ord` - after matching it with the `features` list from the class - initialization parameters. 
- """ - fp = os.path.join(self.output_dir, 'use_features_ord.txt') - with open(fp, 'w+') as file_handle: - for f in self.features: #[self._use_ixs]: - file_handle.write('{0}\n'.format(f)) - - def _get_feature_from_index(self, index): - """ - Gets the feature at an index in the features list. - - Parameters - ---------- - index : int - - Returns - ------- - str - The name of the feature/target at the specified index. - - """ - return self.features[index] #[self._use_ixs][index] - - def evaluate(self): - """ - Passes all samples retrieved from the sampler to the model in - batches and returns the predictions. Also reports the model's - performance on these examples. - - Returns - ------- - dict - A dictionary, where keys are the features and the values are - each a dict of the performance metrics (currently ROC AUC and - AUPR) reported for each feature the model predicts. - - """ - batch_losses = [] - sequences = [] - all_predictions = [] - for (inputs, targets) in self._test_data: - sequences.append(inputs) - #inputs = torch.Tensor(inputs).int() - inputs = torch.Tensor(inputs) - targets = torch.Tensor(targets) #[:, self._use_testmat_ixs]) - if self.use_cuda: - inputs = inputs.cuda() - targets = targets.cuda() - with torch.no_grad(): - predictions = None - if _is_lua_trained_model(self.model): - predictions = self.model.forward( - inputs.transpose(1, 2).contiguous().unsqueeze_(2)) - else: - predictions = self.model.forward( - #inputs)[0] - inputs.transpose(1, 2)) - mask = torch.isnan(targets) - reg_loss = self.reg_criterion( - predictions[~mask], targets[~mask]) - - all_predictions.append(predictions.data.cpu().numpy()) - batch_losses.append(reg_loss.item()) - - all_predictions = np.vstack(all_predictions) - - reg_scores = self._metrics.update( - all_predictions, self._all_test_targets, scores=['pearsonr', 'spearmanr']) - for name, score in reg_scores.items(): - logger.info("test {0}: {1}".format(name, score)) - - np.savez_compressed( - os.path.join(self.output_dir, "test_predictions.N={0}.npz".format( - self.n_test_samples)), - data=all_predictions) - - with h5py.File(os.path.join(self.output_dir, 'test_dataset.N={0}.h5'.format( - self.n_test_samples)), 'a') as fh: - if 'sequences' in fh: - del fh['sequences'] - fh.create_dataset('sequences', data=np.vstack(sequences)) - - lossdict = { - 'loss': np.average(batch_losses) - } - for name, score in lossdict.items(): - logger.info("test {0}: {1}".format(name, score)) - - test_performance = os.path.join( - self.output_dir, "test_performance.txt") - feature_scores_dict = self._metrics.write_feature_scores_to_file( - test_performance) - - return feature_scores_dict diff --git a/selene_sdk/samplers/multi_sampler.py b/selene_sdk/samplers/multi_sampler.py index 1cb6d751..1fb8eb40 100644 --- a/selene_sdk/samplers/multi_sampler.py +++ b/selene_sdk/samplers/multi_sampler.py @@ -265,30 +265,12 @@ def get_data_and_targets(self, batch_size, n_samples=None, mode=None): self._set_batch_size(batch_size, mode=mode) data_and_targets = [] targets_mat = [] - #ind_mat = [] - count = batch_size - while count < n_samples: - #data, tgts, ind = self.sample(batch_size=batch_size, mode=mode) - #data_and_targets.append((data, tgts, ind)) - - output = self.sample(batch_size=batch_size, mode=mode) + for s in range(0, n_samples, batch_size): + e = min(n_samples, s+batch_size) + output = self.sample(batch_size=e-s, mode=mode) data_and_targets.append(output) targets_mat.append(output[1]) - #ind_mat.append(output[2]) - count += batch_size - remainder = batch_size - (count - 
n_samples) - #data, tgts, ind = self.sample(batch_size=remainder, mode=mode) - #data_and_targets.append((data, tgts, ind)) - #targets_mat.append(tgts) - #ind_mat.append(ind) - output = self.sample(batch_size=remainder, mode=mode) - data_and_targets.append(output) - targets_mat.append(output[1]) targets_mat = np.vstack(targets_mat) - - #ind_mat.append(output[2]) - #ind_mat = np.hstack(ind_mat) - #return data_and_targets, targets_mat, ind_mat return data_and_targets, targets_mat def get_validation_set(self, batch_size, n_samples=None): From 152ad61e1f1724f239acfc68f5f4b2fe8ed57002 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 1 May 2024 16:22:22 -0400 Subject: [PATCH 40/62] remove commented out code in shift sections --- selene_sdk/samplers/dataloader.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/selene_sdk/samplers/dataloader.py b/selene_sdk/samplers/dataloader.py index a3ccf2e2..02d4587f 100644 --- a/selene_sdk/samplers/dataloader.py +++ b/selene_sdk/samplers/dataloader.py @@ -254,21 +254,12 @@ def __getitem__(self, index): mid = len(sequence) // 2 self._seq_start = int(mid - np.ceil(self.use_seq_len / 2)) self._seq_end = mid + self.use_seq_len // 2 - #- self.use_seq_len // 2 - #self._seq_end = int(mid + np.ceil(self.use_seq_len / 2)) if self.shift is not None: self._seq_start += self.shift self._seq_end += self.shift - # randomness for UNET - #diff = (len(sequence) - self.use_seq_len) // 2 - #direction = np.random.choice([-1, 1]) - #shift = np.random.choice(np.arange(diff+1)) - #self._seq_start = self._seq_start + direction * shift - #self._seq_end = self._seq_end + direction * shift sequence = sequence[self._seq_start:self._seq_end] s = sequence.astype(np.float32) - #t = targets.astype(np.float32) return (torch.from_numpy(s), torch.from_numpy(targets)) From 6fc0f7bd21f8bd632512c4bb9f38b5eb099bb338 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 1 May 2024 16:39:15 -0400 Subject: [PATCH 41/62] remove ind commented out code in multisampler --- selene_sdk/samplers/multi_sampler.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/selene_sdk/samplers/multi_sampler.py b/selene_sdk/samplers/multi_sampler.py index 1fb8eb40..c24a1e98 100644 --- a/selene_sdk/samplers/multi_sampler.py +++ b/selene_sdk/samplers/multi_sampler.py @@ -218,16 +218,12 @@ def sample(self, batch_size=1, mode=None): try: data, targets = next(self._iterators[mode]) return data.numpy(), targets.numpy() - #data, targets, ind = next(self._iterators[mode]) - #return data.numpy(), targets.numpy(), ind.numpy() except StopIteration: #If DataLoader iterator reaches its length, reinitialize self._iterators[mode] = iter(self._dataloaders[mode]) data, targets = next(self._iterators[mode]) return data.numpy(), targets.numpy() - #data, targets, ind = next(self._iterators[mode]) - #return data.numpy(), targets.numpy(), ind.numpy() def get_data_and_targets(self, batch_size, n_samples=None, mode=None): """ From fa693453a2f6d7db82837fd7ff75235e3ec97741 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 1 May 2024 16:40:14 -0400 Subject: [PATCH 42/62] add excl chr optional arg --- .../samplers/random_positions_sampler.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/selene_sdk/samplers/random_positions_sampler.py b/selene_sdk/samplers/random_positions_sampler.py index 2dc8f493..dedf2442 100644 --- a/selene_sdk/samplers/random_positions_sampler.py +++ b/selene_sdk/samplers/random_positions_sampler.py @@ -125,6 +125,7 @@ def __init__(self, seed=436, validation_holdout=['chr6', 'chr7'], 
test_holdout=['chr8', 'chr9'], + exclude_chrs=['_'], sequence_length=1000, center_bin_to_predict=200, feature_thresholds=0.5, @@ -155,6 +156,8 @@ def __init__(self, self.interval_lengths = [] self._initialized = False + self.exclude_chrs = exclude_chrs + def init(func): # delay initialization to allow multiprocessing @wraps(func) @@ -174,6 +177,14 @@ def dfunc(self, *args, **kwargs): def _partition_genome_by_proportion(self): for chrom, len_chrom in self.reference_sequence.get_chr_lens(): + skip = False + for excl in self.exclude_chrs: + if excl in chrom: + skip = True + break + if skip: + logger.debug('Skipping chromosome {0}'.format(chrom)) + continue self.sample_from_intervals.append( (chrom, self.sequence_length, @@ -212,6 +223,14 @@ def _partition_genome_by_chromosome(self): for mode in self.modes: self._sample_from_mode[mode] = SampleIndices([], []) for index, (chrom, len_chrom) in enumerate(self.reference_sequence.get_chr_lens()): + skip = False + for excl in self.exclude_chrs: + if excl in chrom: + skip = True + break + if skip: + logger.debug('Skipping chromosome {0}'.format(chrom)) + continue if chrom in self.validation_holdout: self._sample_from_mode["validate"].indices.append( index) From 3adbf237f2c62834b7df92b0468b2fcb4768c163 Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 6 May 2024 15:54:55 -0400 Subject: [PATCH 43/62] make adjustments to nonstrandspecific, performancemetrics, for what can be generalized from the methylation specific code --- .../samplers/methylation_data_sampler.py | 3 -- .../utils/non_strand_specific_module.py | 52 ------------------- selene_sdk/utils/performance_metrics.py | 8 ++- 3 files changed, 6 insertions(+), 57 deletions(-) diff --git a/selene_sdk/samplers/methylation_data_sampler.py b/selene_sdk/samplers/methylation_data_sampler.py index 66c6f1c6..d6ba6df8 100644 --- a/selene_sdk/samplers/methylation_data_sampler.py +++ b/selene_sdk/samplers/methylation_data_sampler.py @@ -1,8 +1,5 @@ """ This module provides the RandomPositionsSampler class. - -TODO: Currently, only works with sequences from `selene_sdk.sequences.Genome`. -We would like to generalize this to `selene_sdk.sequences.Sequence` if possible. """ from collections import namedtuple import logging diff --git a/selene_sdk/utils/non_strand_specific_module.py b/selene_sdk/utils/non_strand_specific_module.py index 7565a607..9d0ecd82 100644 --- a/selene_sdk/utils/non_strand_specific_module.py +++ b/selene_sdk/utils/non_strand_specific_module.py @@ -80,55 +80,3 @@ def forward(self, input): return torch.max(output, output_from_rev) - -class NonStrandSpecificInteract(Module): - """ - A torch.nn.Module that wraps a user-specified model architecture if the - architecture does not need to account for sequence strand-specificity. - - Parameters - ---------- - model : torch.nn.Module - The user-specified model architecture. - mode : {'mean', 'max'}, optional - Default is 'mean'. NonStrandSpecific will pass the input and the - reverse-complement of the input into `model`. The mode specifies - whether we should output the mean or max of the predictions as - the non-strand specific prediction. - - Attributes - ---------- - model : torch.nn.Module - The user-specified model architecture. - mode : {'mean', 'max'} - How to handle outputting a non-strand specific prediction. 
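
The mean/max merging described above is simple elementwise arithmetic over the forward- and reverse-strand predictions. In isolation, with stand-in tensors:

    import torch

    output = torch.rand(2, 3)           # forward-strand predictions
    output_from_rev = torch.rand(2, 3)  # reverse-complement predictions
    mean_merged = (output + output_from_rev) / 2
    max_merged = torch.max(output, output_from_rev)
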
- - """ - - def __init__(self, model, mode="mean"): - super(NonStrandSpecificInteract, self).__init__() - - self.model = model - - if mode != "mean" and mode != "max": - raise ValueError("Mode should be one of 'mean' or 'max' but was" - "{0}.".format(mode)) - self.mode = mode - - def forward(self, input): - reverse_input = 5 - input - reverse_input[reverse_input == 5] = 0 - - output = self.model.forward(input)[0] - output_from_rev = self.model.forward(reverse_input)[0] - if type(output) == tuple: - output1, output2 = output - output_from_rev1, output_from_rev2 = output_from_rev - return ((output1 + output_from_rev1) / 2, (output2 + output_from_rev2) / 2) - else: - if self.mode == "mean": - return ((output + output_from_rev) / 2,) - else: - return (torch.max(output, output_from_rev),) - - diff --git a/selene_sdk/utils/performance_metrics.py b/selene_sdk/utils/performance_metrics.py index 23aa4606..f7f29abf 100644 --- a/selene_sdk/utils/performance_metrics.py +++ b/selene_sdk/utils/performance_metrics.py @@ -207,11 +207,15 @@ def compute_score(prediction, target, metric_fn, prediction = prediction.T for index, feature_preds in enumerate(prediction): feature_targets = target[:, index] + feature_preds = feature_preds[~np.isnan(feature_targets)] + feature_targets = feature_targets[~np.isnan(feature_targets)] if len(np.unique(feature_targets)) > 0 and \ np.count_nonzero(feature_targets) > report_gt_feature_n_positives: try: - feature_scores[index] = metric_fn( - feature_targets, feature_preds) + output = metric_fn(feature_targets, feature_preds) + if type(output) != float and (metric_fn.__name__ == 'spearmanr' or metric_fn.__name__ == 'pearsonr'): + output = output[0] + feature_scores[index] = output except ValueError: # do I need to make this more generic? continue valid_feature_scores = [s for s in feature_scores if not np.isnan(s)] # Allow 0 or negative values. From dca3ae40f6029f51000e958b19ea3defca057d7f Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 7 May 2024 10:40:15 -0400 Subject: [PATCH 44/62] remove files that we have merged the functionality into existing classes --- .../samplers/methylation_data_sampler.py | 410 ------------------ .../utils/methylation_performance_metrics.py | 284 ------------ 2 files changed, 694 deletions(-) delete mode 100644 selene_sdk/samplers/methylation_data_sampler.py delete mode 100644 selene_sdk/utils/methylation_performance_metrics.py diff --git a/selene_sdk/samplers/methylation_data_sampler.py b/selene_sdk/samplers/methylation_data_sampler.py deleted file mode 100644 index d6ba6df8..00000000 --- a/selene_sdk/samplers/methylation_data_sampler.py +++ /dev/null @@ -1,410 +0,0 @@ -""" -This module provides the RandomPositionsSampler class. -""" -from collections import namedtuple -import logging -import random - -import numpy as np - -from functools import wraps -from .online_sampler import OnlineSampler -from ..utils import get_indices_and_probabilities - -logger = logging.getLogger(__name__) - - -SampleIndices = namedtuple( - "SampleIndices", ["indices", "weights"]) -""" -A tuple containing the indices for some samples, and a weight to -allot to each index when randomly drawing from them. - -TODO: this is common to both the intervals sampler and the -random positions sampler. Can we move this to utils or -somewhere else? - -Parameters ----------- -indices : list(int) - The numeric index of each sample. -weights : list(float) - The amount of weight assigned to each sample. - -Attributes ----------- -indices : list(int) - The numeric index of each sample. 
-weights : list(float) - The amount of weight assigned to each sample. - -""" - - -class MethylationDataSampler(OnlineSampler): - """This sampler randomly selects a position in the genome and queries for - a sequence centered at that position for input to the model. - - TODO: generalize to selene_sdk.sequences.Sequence? - - Parameters - ---------- - reference_sequence : selene_sdk.sequences.Genome - A reference sequence from which to create examples. - targets : GenomicFeaturesH5 - features : list(str) - List of distinct features that we aim to predict. - seed : int, optional - Default is 436. Sets the random seed for sampling. - validation_holdout : list(str) or float, optional - Default is `['chr6', 'chr7']`. Holdout can be regional or - proportional. If regional, expects a list (e.g. `['chrX', 'chrY']`). - Regions must match those specified in the first column of the - tabix-indexed BED file. If proportional, specify a percentage - between (0.0, 1.0). Typically 0.10 or 0.20. - test_holdout : list(str) or float, optional - Default is `['chr8', 'chr9']`. See documentation for - `validation_holdout` for additional information. - sequence_length : int, optional - Default is 1000. Model is trained on sequences of `sequence_length` - where genomic features are annotated to the center regions of - these sequences. - center_bin_to_predict : int, optional - Default is 200. Query the tabix-indexed file for a region of - length `center_bin_to_predict`. - mode : {'train', 'validate', 'test'} - Default is `'train'`. The mode to run the sampler in. - save_datasets : list(str), optional - Default is `['test']`. The list of modes for which we should - save the sampled data to file. - output_dir : str or None, optional - Default is None. The path to the directory where we should - save sampled examples for a mode. If `save_datasets` is - a non-empty list, `output_dir` must be specified. If - the path in `output_dir` does not exist it will be created - automatically. - - Attributes - ---------- - reference_sequence : selene_sdk.sequences.Genome - The reference sequence that examples are created from. - target : selene_sdk.targets.Target - The `selene_sdk.targets.Target` object holding the features that we - would like to predict. - validation_holdout : list(str) or float - The samples to hold out for validating model performance. These - can be "regional" or "proportional". If regional, this is a list - of region names (e.g. `['chrX', 'chrY']`). These regions must - match those specified in the first column of the tabix-indexed - BED file. If proportional, this is the fraction of total samples - that will be held out. - test_holdout : list(str) or float - The samples to hold out for testing model performance. See the - documentation for `validation_holdout` for more details. - sequence_length : int - The length of the sequences to train the model on. - modes : list(str) - The list of modes that the sampler can be run in. - mode : str - The current mode that the sampler is running in. Must be one of - the modes listed in `modes`. 
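
The holdout parameters described above accept either chromosome lists or fractions. Two hypothetical configurations, for contrast:

    # Regional holdout: whole chromosomes are reserved for each partition.
    regional = dict(validation_holdout=['chr6', 'chr7'],
                    test_holdout=['chr8', 'chr9'])
    # Proportional holdout: fractions of the sampled intervals instead.
    proportional = dict(validation_holdout=0.10, test_holdout=0.10)
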
- - """ - def __init__(self, - reference_sequence, - target_path, - features, - seed=436, - validation_holdout=['chr6', 'chr7'], - test_holdout=['chr8', 'chr9'], - sequence_length=1000, - center_bin_to_predict=200, - feature_thresholds=0.5, - mode="train", - save_datasets=[], - output_dir=None): - super(MethylationDataSampler, self).__init__( - reference_sequence, - target_path, - features, - seed=seed, - validation_holdout=validation_holdout, - test_holdout=test_holdout, - sequence_length=sequence_length, - center_bin_to_predict=center_bin_to_predict, - feature_thresholds=feature_thresholds, - mode=mode, - save_datasets=save_datasets, - output_dir=output_dir) - - self._sample_from_mode = {} - self._randcache = {} - for mode in self.modes: - self._sample_from_mode[mode] = None - self._randcache[mode] = {"cache_indices": None, "sample_next": 0} - - self.sample_from_intervals = [] - self.interval_lengths = [] - self._initialized = False - - def init(func): - # delay initialization to allow multiprocessing - @wraps(func) - def dfunc(self, *args, **kwargs): - if not self._initialized: - if self._holdout_type == "chromosome": - self._partition_genome_by_chromosome() - else: - self._partition_genome_by_proportion() - - for mode in self.modes: - self._update_randcache(mode=mode) - self._initialized = True - return func(self, *args, **kwargs) - return dfunc - - - def _partition_genome_by_proportion(self): - for chrom, len_chrom in self.reference_sequence.get_chr_lens(): - if '_' in chrom: - continue - if 'X' in chrom or 'Y' in chrom or 'M' in chrom: - continue - self.sample_from_intervals.append( - (chrom, - self.sequence_length, - len_chrom - self.sequence_length)) - self.interval_lengths.append(len_chrom) - n_intervals = len(self.sample_from_intervals) - - select_indices = list(range(n_intervals)) - np.random.shuffle(select_indices) - n_indices_validate = int(n_intervals * self.validation_holdout) - val_indices, val_weights = get_indices_and_probabilities( - self.interval_lengths, select_indices[:n_indices_validate]) - self._sample_from_mode["validate"] = SampleIndices( - val_indices, val_weights) - - if self.test_holdout: - n_indices_test = int(n_intervals * self.test_holdout) - test_indices_end = n_indices_test + n_indices_validate - test_indices, test_weights = get_indices_and_probabilities( - self.interval_lengths, - select_indices[n_indices_validate:test_indices_end]) - self._sample_from_mode["test"] = SampleIndices( - test_indices, test_weights) - - tr_indices, tr_weights = get_indices_and_probabilities( - self.interval_lengths, select_indices[test_indices_end:]) - self._sample_from_mode["train"] = SampleIndices( - tr_indices, tr_weights) - else: - tr_indices, tr_weights = get_indices_and_probabilities( - self.interval_lengths, select_indices[n_indices_validate:]) - self._sample_from_mode["train"] = SampleIndices( - tr_indices, tr_weights) - - def _partition_genome_by_chromosome(self): - for mode in self.modes: - self._sample_from_mode[mode] = SampleIndices([], []) - index = 0 - for (chrom, len_chrom) in self.reference_sequence.get_chr_lens(): - if '_' in chrom: - continue - if 'X' in chrom or 'Y' in chrom or 'M' in chrom: - continue - if chrom in self.validation_holdout: - self._sample_from_mode["validate"].indices.append( - index) - elif self.test_holdout and chrom in self.test_holdout: - self._sample_from_mode["test"].indices.append( - index) - else: - self._sample_from_mode["train"].indices.append( - index) - - self.sample_from_intervals.append( - (chrom, - self.sequence_length, - 
len_chrom - self.sequence_length)) - self.interval_lengths.append(len_chrom - 2 * self.sequence_length) - index += 1 - - for mode in self.modes: - sample_indices = self._sample_from_mode[mode].indices - indices, weights = get_indices_and_probabilities( - self.interval_lengths, sample_indices) - self._sample_from_mode[mode] = \ - self._sample_from_mode[mode]._replace( - indices=indices, weights=weights) - - def _retrieve(self, chrom, position, strand=None): - bin_start = position - self._start_radius - bin_end = position + self._end_radius - #indicator = self.target.is_positive( - # chrom, bin_start, bin_end) - indicator, retrieved_targets = self.target.get_feature_data( - chrom, bin_start, bin_end) - window_start = position - self._start_window_radius - window_end = position + self._end_window_radius - if window_end - window_start < self.sequence_length: - print(bin_start, bin_end, - self._start_radius, self._end_radius, - self._start_window_radius, self._end_window_radius,) - return None - if strand is None: - strand = self.STRAND_SIDES[random.randint(0, 1)] - retrieved_seq = \ - self.reference_sequence.get_encoding_from_coords( - chrom, window_start, window_end, strand) - - if retrieved_seq.shape[0] == 0: - logger.info("Full sequence centered at {0} position {1} " - "could not be retrieved. Sampling again.".format( - chrom, position)) - return None - elif np.mean(retrieved_seq==0.25) > 0.30: - logger.info("Over 30% of the bases in the sequence centered " - "at {0} position {1} are ambiguous ('N'). " - "Sampling again.".format(chrom, position)) - return None - - if self.mode in self._save_datasets: - feature_indices = ';'.join( - [str(f) for f in np.nonzero(retrieved_targets)[0]]) - self._save_datasets[self.mode].append( - [chrom, - window_start, - window_end, - strand, - feature_indices]) - if len(self._save_datasets[self.mode]) > 200000: - self.save_dataset_to_file(self.mode) - return (retrieved_seq, retrieved_targets, indicator) - - def _update_randcache(self, mode=None): - if not mode: - mode = self.mode - self._randcache[mode]["cache_indices"] = np.random.choice( - self._sample_from_mode[mode].indices, - size=200000, - replace=True, - p=self._sample_from_mode[mode].weights) - self._randcache[mode]["sample_next"] = 0 - - @init - def sample(self, batch_size=1, mode=None): - """ - Randomly draws a mini-batch of examples and their corresponding - labels. - - Parameters - ---------- - batch_size : int, optional - Default is 1. The number of examples to include in the - mini-batch. - mode : str, optional - Default is None. The operating mode that the object should run in. - If None, will use the current mode `self.mode`. - - Returns - ------- - sequences, targets : tuple(numpy.ndarray, numpy.ndarray) - A tuple containing the numeric representation of the - sequence examples and their corresponding labels. The - shape of `sequences` will be - :math:`B \\times L \\times N`, where :math:`B` is - `batch_size`, :math:`L` is the sequence length, and - :math:`N` is the size of the sequence type's alphabet. - The shape of `targets` will be :math:`B \\times F`, - where :math:`F` is the number of features. 
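
The `_update_randcache` method above amortizes weighted sampling by drawing 200,000 interval indices in one call and consuming them one at a time. The same pattern in isolation (the indices and weights are illustrative):

    import numpy as np

    indices = [0, 1, 2]
    weights = [0.5, 0.3, 0.2]
    cache = np.random.choice(indices, size=200000, replace=True, p=weights)
    sample_next = 0
    draw = cache[sample_next]  # one cached draw per sample
    sample_next += 1           # refill the cache once sample_next runs off the end
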
- - """ - mode = mode if mode else self.mode - sequences = np.zeros((batch_size, self.sequence_length, 4)) - targets = np.zeros((batch_size, self.n_features)) - indicator = np.zeros(batch_size) - n_samples_drawn = 0 - while n_samples_drawn < batch_size: - sample_index = self._randcache[mode]["sample_next"] - if sample_index == len(self._randcache[mode]["cache_indices"]): - self._update_randcache() - sample_index = 0 - - rand_interval_index = \ - self._randcache[mode]["cache_indices"][sample_index] - self._randcache[mode]["sample_next"] += 1 - - chrom, cstart, cend = \ - self.sample_from_intervals[rand_interval_index] - position = np.random.randint(cstart, cend) - - retrieve_output = self._retrieve(chrom, position) - if not retrieve_output: - continue - seq, seq_targets, methyl = retrieve_output - sequences[n_samples_drawn, :, :] = seq - targets[n_samples_drawn, :] = seq_targets - indicator[n_samples_drawn] = methyl - n_samples_drawn += 1 - return (sequences, targets, indicator) - - - def get_data_and_targets(self, batch_size, n_samples=None, mode=None): - """ - This method fetches a subset of the data from the sampler, - divided into batches. This method also allows the user to - specify what operating mode to run the sampler in when fetching - the data. - - Parameters - ---------- - batch_size : int - The size of the batches to divide the data into. - n_samples : int or None, optional - Default is None. The total number of samples to retrieve. - If `n_samples` is None and the mode is `validate`, will - set `n_samples` to 32000; if the mode is `test`, will set - `n_samples` to 640000 if it is None. If the mode is `train` - you must have specified a value for `n_samples`. - mode : str, optional - Default is None. The mode to run the sampler in when - fetching the samples. See - `selene_sdk.samplers.IntervalsSampler.modes` for more - information. If None, will use the current mode `self.mode`. - - Returns - ------- - sequences_and_targets, targets_matrix : \ - tuple(list(tuple(numpy.ndarray, numpy.ndarray)), numpy.ndarray) - Tuple containing the list of sequence-target pairs, as well - as a single matrix with all targets in the same order. - Note that `sequences_and_targets`'s sequence elements are of - the shape :math:`B \\times L \\times N` and its target - elements are of the shape :math:`B \\times F`, where - :math:`B` is `batch_size`, :math:`L` is the sequence length, - :math:`N` is the size of the sequence type's alphabet, and - :math:`F` is the number of features. Further, - `target_matrix` is of the shape :math:`S \\times F`, where - :math:`S =` `n_samples`. 
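
The sampling loop above keeps drawing until `batch_size` usable examples are collected, because `_retrieve` can return `None` for windows that run off a chromosome or contain too many ambiguous bases. A stripped-down version of that control flow, with a stand-in `retrieve`:

    import numpy as np

    def retrieve():
        # Stand-in for _retrieve: fails some fraction of the time.
        return None if np.random.rand() < 0.1 else np.zeros((1000, 4))

    batch_size, drawn = 64, []
    while len(drawn) < batch_size:
        out = retrieve()
        if out is None:
            continue  # sample again, exactly as the sampler does
        drawn.append(out)
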
- - """ - if mode is not None: - self.set_mode(mode) - else: - mode = self.mode - sequences_and_targets = [] - if n_samples is None and mode == "validate": - n_samples = 32000 - elif n_samples is None and mode == "test": - n_samples = 640000 - - n_batches = int(n_samples / batch_size) - for _ in range(n_batches): - inputs, targets, indicator = self.sample(batch_size) - sequences_and_targets.append((inputs, targets, indicator)) - targets_mat = np.vstack([t for (s, t, i) in sequences_and_targets]) - ind_mat = np.hstack([i for (s, t, i) in sequences_and_targets]) - if mode in self._save_datasets: - self.save_dataset_to_file(mode, close_filehandle=True) - return sequences_and_targets, targets_mat, ind_mat diff --git a/selene_sdk/utils/methylation_performance_metrics.py b/selene_sdk/utils/methylation_performance_metrics.py deleted file mode 100644 index 0a8ea114..00000000 --- a/selene_sdk/utils/methylation_performance_metrics.py +++ /dev/null @@ -1,284 +0,0 @@ -""" -This module provides the `PerformanceMetrics` class and supporting -functionality for tracking and computing model performance. -""" -from collections import defaultdict, namedtuple -import logging -import os - -import numpy as np -from scipy.stats import rankdata - - -logger = logging.getLogger("selene") - - -Metric = namedtuple("Metric", ["fn", "data"]) -""" -A tuple containing a metric function and the results from applying that -metric to some values. - -Parameters ----------- -fn : types.FunctionType - A metric. -data : list(float) - A list holding the results from applying the metric. - -Attributes ----------- -fn : types.FunctionType - A metric. -data : list(float) - A list holding the results from applying the metric. - -""" - - -def compute_score(prediction, target, metric_fn, - report_gt_feature_n_positives=10): - """ - Using a user-specified metric, computes the distance between - two tensors. - - Parameters - ---------- - prediction : numpy.ndarray - Value predicted by user model. - target : numpy.ndarray - True value that the user model was trying to predict. - metric_fn : types.FunctionType - A metric that can measure the distance between the prediction - and target variables. - report_gt_feature_n_positives : int, optional - Default is 10. The minimum number of positive examples for a - feature in order to compute the score for it. - - Returns - ------- - average_score, feature_scores : tuple(float, numpy.ndarray) - A tuple containing the average of all feature scores, and a - vector containing the scores for each feature. If there were - no features meeting our filtering thresholds, will return - `(None, [])`. - """ - feature_scores = np.ones(target.shape[1]) * np.nan - # Deal with the case of multi-class classification, where each example only has one target value but multiple prediction values - if target.shape[1] == 1 and prediction.shape[1] > 1: - prediction = [prediction] - else: - prediction = prediction.T - for index, feature_preds in enumerate(prediction): - feature_targets = target[:, index] - feature_preds = feature_preds[~np.isnan(feature_targets)] - feature_targets = feature_targets[~np.isnan(feature_targets)] - if len(np.unique(feature_targets)) > 0 and \ - np.count_nonzero(feature_targets) > report_gt_feature_n_positives: - try: - output = metric_fn(feature_targets, feature_preds) - if type(output) != float and (metric_fn.__name__ == 'spearmanr' or metric_fn.__name__ == 'pearsonr'): - output = output[0] - feature_scores[index] = output - except ValueError: # do I need to make this more generic? 
- continue - valid_feature_scores = [s for s in feature_scores if not np.isnan(s)] # Allow 0 or negative values. - if not valid_feature_scores: - return None, feature_scores - average_score = np.average(valid_feature_scores) - return average_score, feature_scores - - -def get_feature_specific_scores(data, get_feature_from_index_fn): - """ - Generates a dictionary mapping feature names to feature scores from - an intermediate representation. - - Parameters - ---------- - data : list(tuple(int, float)) - A list of tuples, where each tuple contains a feature's index - and the score for that feature. - get_feature_from_index_fn : types.FunctionType - A function that takes an index (`int`) and returns a feature - name (`str`). - - Returns - ------- - dict - A dictionary mapping feature names (`str`) to scores (`float`). - If there was no score for a feature, its score will be set to - `None`. - - """ - feature_score_dict = {} - for index, score in enumerate(data): - feature = get_feature_from_index_fn(index) - if not np.isnan(score): - feature_score_dict[feature] = score - else: - feature_score_dict[feature] = None - return feature_score_dict - - -class MethylationPerformanceMetrics(object): - """ - Tracks and calculates metrics to evaluate how closely a model's - predictions match the true values it was designed to predict. - - Parameters - ---------- - get_feature_from_index_fn : types.FunctionType - A function that takes an index (`int`) and returns a feature - name (`str`). - report_gt_feature_n_positives : int, optional - Default is 10. The minimum number of positive examples for a - feature in order to compute the score for it. - metrics : dict - A dictionary that maps metric names (`str`) to metric functions. - By default, this contains `"roc_auc"`, which maps to - `sklearn.metrics.roc_auc_score`, and `"average_precision"`, - which maps to `sklearn.metrics.average_precision_score`. - - - - Attributes - ---------- - skip_threshold : int - The minimum number of positive examples of a feature that must - be included in an update for a metric score to be - calculated for it. - get_feature_from_index : types.FunctionType - A function that takes an index (`int`) and returns a feature - name (`str`). - metrics : dict - A dictionary that maps metric names (`str`) to metric objects - (`Metric`). By default, this contains `"roc_auc"` and - `"average_precision"`. - - """ - - def __init__(self, - get_feature_from_index_fn, - metrics, - report_gt_feature_n_positives=10): - """ - Creates a new object of the `PerformanceMetrics` class. - """ - self.skip_threshold = report_gt_feature_n_positives - self.get_feature_from_index = get_feature_from_index_fn - self.metrics = dict() - for k, v in metrics.items(): - self.metrics[k] = Metric(fn=v, data=[]) - - def add_metric(self, name, metric_fn): - """ - Begins tracking of the specified metric. - - Parameters - ---------- - name : str - The name of the metric. - metric_fn : types.FunctionType - A metric function. - - """ - self.metrics[name] = Metric(fn=metric_fn, data=[]) - - def remove_metric(self, name): - """ - Ends the tracking of the specified metric, and returns the - previous scores associated with that metric. - - Parameters - ---------- - name : str - The name of the metric. - - Returns - ------- - list(float) - The list of feature-specific scores obtained by previous - uses of the specified metric. 
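
One detail the deleted `compute_score` above guards against, and which the retained version in `performance_metrics.py` now handles the same way: `scipy.stats.pearsonr` and `spearmanr` return a (statistic, p-value) pair rather than a bare float. For example:

    import numpy as np
    from scipy.stats import pearsonr, spearmanr

    targets = np.array([0.10, 0.40, 0.35, 0.80])
    preds = np.array([0.15, 0.35, 0.40, 0.70])
    r, p = pearsonr(targets, preds)     # unpack the statistic explicitly...
    rho = spearmanr(targets, preds)[0]  # ...or index [0], as compute_score does
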
- - """ - data = self.metrics[name].data - del self.metrics[name] - return data - - def update(self, prediction, target, scores=None): - """ - Evaluates the tracked metrics on a model prediction and its - target value, and adds this to the metric histories. - - Parameters - ---------- - prediction : numpy.ndarray - Value predicted by user model. - target : numpy.ndarray - True value that the user model was trying to predict. - - Returns - ------- - dict - A dictionary mapping each metric names (`str`) to the - average score of that metric across all features - (`float`). - - """ - if scores is None: - scores = list(self.metrics.keys()) - - metric_scores = {} - for name in scores: - metric = self.metrics[name] - avg_score, feature_scores = compute_score( - prediction, target, metric.fn, - report_gt_feature_n_positives=self.skip_threshold) - metric.data.append(feature_scores) - metric_scores[name] = avg_score - return metric_scores - - def write_feature_scores_to_file(self, output_path): - """ - Writes each metric's score for each feature to a specified - file. - - Parameters - ---------- - output_path : str - The path to the output file where performance metrics will - be written. - - Returns - ------- - dict - A dictionary mapping feature names (`str`) to - sub-dictionaries (`dict`). Each sub-dictionary then maps - metric names (`str`) to the score for that metric on the - given feature. If a metric was not evaluated on a given - feature, the score will be `None`. - - """ - feature_scores = defaultdict(dict) - for name, metric in self.metrics.items(): - feature_score_dict = get_feature_specific_scores( - metric.data[-1], self.get_feature_from_index) - for feature, score in feature_score_dict.items(): - if score is None: - feature_scores[feature] = None - else: - feature_scores[feature][name] = score - - metric_cols = [m for m in self.metrics.keys()] - cols = '\t'.join(["class"] + metric_cols) - with open(output_path, 'w+') as file_handle: - file_handle.write("{0}\n".format(cols)) - for feature, metric_scores in feature_scores.items(): - if not metric_scores: - file_handle.write("{0}\t{1}\n".format(feature, "\t".join(["NA"] * len(metric_cols)))) - else: - metric_score_cols = '\t'.join( - ["{0:.4f}".format(s) for s in metric_scores.values()]) - file_handle.write("{0}\t{1}\n".format(feature, - metric_score_cols)) - return feature_scores From 53f498dcacfc7b0802b0afd26149ff27921d8990 Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 7 May 2024 11:04:16 -0400 Subject: [PATCH 45/62] clean up indicator code that is no longer used --- selene_sdk/targets/genomic_features_h5.py | 43 +++++++++++------------ 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/selene_sdk/targets/genomic_features_h5.py b/selene_sdk/targets/genomic_features_h5.py index fedef98c..be889094 100644 --- a/selene_sdk/targets/genomic_features_h5.py +++ b/selene_sdk/targets/genomic_features_h5.py @@ -98,29 +98,22 @@ class GenomicFeaturesH5(Target): Attributes ---------- - data : tabix.open - The data stored in a tabix-indexed `*.bed` file. + coords : tabix.open + The coordinates and row index stored in a tabix-indexed `*.bed` file. + data : h5py.File + The matrix of target data corresponding to the coordinates in `coords`. n_targets : int The number of distinct targets. target_index_dict : dict A dictionary mapping target names (`str`) to indices (`int`), where the index is the position of the target in `targets`. 
- index_target_dict : dict - A dictionary mapping indices (`int`) to target names (`str`), - where the index is the position of the target in the input - targets. - target_thresholds : dict or None - - * `dict` - A dictionary mapping target names (`str`) to thresholds\ - (`float`), where the threshold is the minimum overlap that a\ - target annotation must have with a query region to be\ - considered a positive example of that target. - * `None` - No threshold specifications. Assumes that all targets\ - returned by a tabix query are annotated to the query region. - """ - def __init__(self, tabix_path, h5_path, targets, init_unpicklable=False): + def __init__(self, + tabix_path, + h5_path, + targets, + init_unpicklable=False): """ Constructs a new `GenomicFeaturesH5` object. """ @@ -246,14 +239,18 @@ def get_feature_data(self, chrom, start, end): nans = np.zeros(self.n_targets) * np.nan rows = self._query_tabix(chrom, start, end) if rows is None: - return False, nans + return nans + row_targets = [] for r in rows: - target = int(r[3]) - row_targets.append(self.data[target]) - if len(row_targets) == 0: # query was empty - return False, nans + ix = int(r[3]) + row_targets.append(self.data[ix]) + + if len(row_targets) == 0: + return nans + row_targets = np.vstack(row_targets) if len(row_targets) == 1: - return True, row_targets[0] - return True, np.average(row_targets, axis=0) + return row_targets[0] + + return np.average(row_targets, axis=0) From 437092ebcb2a57a456ce35b4f7201ebf2ef779b3 Mon Sep 17 00:00:00 2001 From: Kathy Date: Tue, 28 May 2024 14:22:02 -0400 Subject: [PATCH 46/62] integrate changes for methylation prediction in training and metrics --- selene_sdk/train_model.py | 13 +++++-------- selene_sdk/utils/performance_metrics.py | 10 ++++++++-- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/selene_sdk/train_model.py b/selene_sdk/train_model.py index ec52eb00..61aca80d 100644 --- a/selene_sdk/train_model.py +++ b/selene_sdk/train_model.py @@ -265,7 +265,6 @@ def __init__(self, if self.use_cuda: self.model.cuda() - self.criterion.cuda() logger.debug("Set modules to use CUDA") self._report_gt_feature_n_positives = report_gt_feature_n_positives @@ -311,6 +310,7 @@ def _load_checkpoint(self, checkpoint_resume): self._start_step = checkpoint["step"] if self._start_step >= self.max_steps: self.max_steps += self._start_step + self.step = self._start_step self._min_loss = checkpoint["min_loss"] self.optimizer.load_state_dict( @@ -327,6 +327,7 @@ def _load_checkpoint(self, checkpoint_resume): def _init_train(self, scheduler_kwargs): self._start_step = 0 + self.step = self._start_step self._train_logger = _metrics_logger( "{0}.train".format(__name__), self.output_dir) self._train_logger.info("loss") @@ -348,7 +349,7 @@ def _init_validate(self): self._validation_logger = _metrics_logger( "{0}.validation".format(__name__), self.output_dir) - self._validation_logger.info("\t".join(["loss"] + + self._validation_logger.info("\t".join(["loss",] + sorted([x for x in self._validation_metrics.metrics.keys()]))) def _init_test(self): @@ -484,8 +485,8 @@ def train(self): targets = torch.Tensor(targets) if self.use_cuda: - inputs = inputs.cuda() - targets = targets.cuda() + inputs = inputs.to('cuda', non_blocking=True) + targets = targets.to('cuda', non_blocking=True) predictions = self.model(inputs.transpose(1, 2)) loss = self.criterion(predictions, targets) @@ -632,10 +633,6 @@ def evaluate(self): test_performance) average_scores["loss"] = average_loss - - self._test_metrics.visualize( - 
all_predictions, self._all_test_targets, self.output_dir) - return (average_scores, feature_scores_dict) def _save_checkpoint(self, diff --git a/selene_sdk/utils/performance_metrics.py b/selene_sdk/utils/performance_metrics.py index f7f29abf..e0fd0d71 100644 --- a/selene_sdk/utils/performance_metrics.py +++ b/selene_sdk/utils/performance_metrics.py @@ -371,7 +371,7 @@ def remove_metric(self, name): del self.metrics[name] return data - def update(self, prediction, target): + def update(self, prediction, target, scores=None): """ Evaluates the tracked metrics on a model prediction and its target value, and adds this to the metric histories. @@ -382,6 +382,8 @@ def update(self, prediction, target): Value predicted by user model. target : numpy.ndarray True value that the user model was trying to predict. + scores : list(str), optional + Default is None. Specify only a subset of metrics to update. Returns ------- @@ -391,8 +393,12 @@ def update(self, prediction, target): (`float`). """ + if scores is None: + scores = list(self.metrics.keys()) + metric_scores = {} - for name, metric in self.metrics.items(): + for name in scores: + metric = self.metrics[name] avg_score, feature_scores = compute_score( prediction, target, metric.fn, report_gt_feature_n_positives=self.skip_threshold) From 6aabd4e2159c1bcbbfd98e14291a9805cac1199b Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 29 May 2024 12:27:18 -0400 Subject: [PATCH 47/62] incorporate changes from previous PR on config yaml and model file saving --- selene_sdk/utils/config_utils.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/selene_sdk/utils/config_utils.py b/selene_sdk/utils/config_utils.py index b7546583..62508295 100644 --- a/selene_sdk/utils/config_utils.py +++ b/selene_sdk/utils/config_utils.py @@ -9,6 +9,8 @@ from time import strftime import types import random +import shutil +import yaml import numpy as np import torch @@ -16,6 +18,8 @@ from . import _is_lua_trained_model from . import instantiate +from .. 
import version + def class_instantiate(classobj): """Not used currently, but might be useful later for recursive @@ -111,6 +115,7 @@ def initialize_model(model_configs, train=True, lr=None): module = None if os.path.isdir(import_model_from): + import_model_from = import_model_from.rstrip(os.sep) module = module_from_dir(import_model_from) else: module = module_from_file(import_model_from) @@ -307,6 +312,9 @@ def parse_configs_and_run(configs, """ operations = configs["ops"] + configs["selene_sdk_version"] = version.__version__ + print("Running with selene_sdk version {0}".format(version.__version__)) + if "train" in operations and "lr" not in configs and lr != None: configs["lr"] = float(lr) elif "train" in operations and "lr" in configs and lr != None: @@ -331,11 +339,26 @@ def parse_configs_and_run(configs, if "create_subdirectory" in configs: create_subdirectory = configs["create_subdirectory"] if create_subdirectory: + res = ''.join(random.choices(string.ascii_uppercase + + string.digits, k=8)) current_run_output_dir = os.path.join( - current_run_output_dir, strftime("%Y-%m-%d-%H-%M-%S")) + current_run_output_dir, + '{0}-{1}'.format(strftime("%Y-%m-%d-%H-%M-%S"), res)) os.makedirs(current_run_output_dir) print("Outputs and logs saved to {0}".format( current_run_output_dir)) + configs['output_dir'] = current_run_output_dir + config_out = os.path.join(current_run_output_dir, 'configuration.yaml') + with open(config_out, 'w') as f: + yaml.dump(configs, f, default_flow_style=None) + model_input = configs['model']['path'] + if os.path.isdir(model_input): + shutil.copytree(model_input, + os.path.join(current_run_output_dir, + os.path.basename(import_model_from)), + dirs_exist_ok=True) + else: + shutil.copy(model_input, current_run_output_dir) if "random_seed" in configs: seed = configs["random_seed"] From 947825dbf601b6f1686f35470381940e34564eeb Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 29 May 2024 12:50:25 -0400 Subject: [PATCH 48/62] update pytorch version constraint --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 96546bd3..e8d8cfbb 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ "scipy", "seaborn", "statsmodels", - "torch>=0.4.1, <=1.9", + "torch>=0.4.1, <=2.2", ], entry_points={ 'console_scripts': [ From eb6edba3a190d628809ab632505e67e10ef13000 Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 8 Jul 2024 12:28:03 -0400 Subject: [PATCH 49/62] variable name fix for copying model file / directory --- selene_sdk/utils/config_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/selene_sdk/utils/config_utils.py b/selene_sdk/utils/config_utils.py index 62508295..c872c42f 100644 --- a/selene_sdk/utils/config_utils.py +++ b/selene_sdk/utils/config_utils.py @@ -27,7 +27,7 @@ class instantiation """ for attr, obj in classobj.__dict__.items(): is_module = getattr(obj, '__module__', None) - if is_module and "selene_sdk" in is_module and attr is not "model": + if is_module and "selene_sdk" in is_module and attr != "model": class_instantiate(obj) classobj.__init__(**classobj.__dict__) @@ -355,10 +355,12 @@ def parse_configs_and_run(configs, if os.path.isdir(model_input): shutil.copytree(model_input, os.path.join(current_run_output_dir, - os.path.basename(import_model_from)), + os.path.basename(model_input)), dirs_exist_ok=True) else: - shutil.copy(model_input, current_run_output_dir) + shutil.copyfile(model_input, + os.path.join(current_run_output_dir, + os.path.basename(model_input))) if "random_seed" in 
configs: seed = configs["random_seed"] From 59f70f9b0167b923acf6cd71a75c062e57f60444 Mon Sep 17 00:00:00 2001 From: Kathy Date: Mon, 15 Jul 2024 11:04:43 -0400 Subject: [PATCH 50/62] remove train methylation model class --- selene_sdk/train_methylation_model.py | 774 -------------------------- 1 file changed, 774 deletions(-) delete mode 100644 selene_sdk/train_methylation_model.py diff --git a/selene_sdk/train_methylation_model.py b/selene_sdk/train_methylation_model.py deleted file mode 100644 index efde31b4..00000000 --- a/selene_sdk/train_methylation_model.py +++ /dev/null @@ -1,774 +0,0 @@ -""" -This module provides the `TrainModel` class and supporting methods. -""" -import logging -import math -import os -import shutil -from time import strftime -from time import time - -import numpy as np -import torch -import torch.autograd.profiler as profiler -import torch.nn as nn -import torch.nn.functional as F -from torch.optim.lr_scheduler import ReduceLROnPlateau -from scipy.stats import pearsonr -from scipy.stats import spearmanr -from sklearn.metrics import average_precision_score -from sklearn.metrics import roc_auc_score -from sklearn.metrics import mean_squared_error - -from .utils import initialize_logger -from .utils import load_model_from_state_dict -from .utils.methylation_performance_metrics import MethylationPerformanceMetrics - -logger = logging.getLogger("selene") - - -def _metrics_logger(name, out_filepath): - logger = logging.getLogger("{0}".format(name)) - logger.setLevel(logging.INFO) - formatter = logging.Formatter("%(message)s") - file_handle = logging.FileHandler( - os.path.join(out_filepath, "{0}.txt".format(name))) - file_handle.setFormatter(formatter) - logger.addHandler(file_handle) - return logger - - -class TrainMethylationModel(object): - """ - This class ties together the various objects and methods needed to - train and validate a model. - - TrainModel saves a checkpoint model (overwriting it after - `save_checkpoint_every_n_steps`) as well as a best-performing model - (overwriting it after `report_stats_every_n_steps` if the latest - validation performance is better than the previous best-performing - model) to `output_dir`. - - TrainModel also outputs 2 files that can be used to monitor training - as Selene runs: `selene_sdk.train_model.train.txt` (training loss) and - `selene_sdk.train_model.validation.txt` (validation loss & average - ROC AUC). The columns in these files can be used to quickly visualize - training history (e.g. you can use `matplotlib`, `plt.plot(auc_list)`) - and see, for example, whether the model is still improving, if there are - signs of overfitting, etc. - - Parameters - ---------- - model : torch.nn.Module - The model to train. - data_sampler : selene_sdk.samplers.Sampler - The example generator. - optimizer_class : torch.optim.Optimizer - The optimizer to minimize loss with. - optimizer_kwargs : dict - The dictionary of keyword arguments to pass to the optimizer's - constructor. - batch_size : int - Specify the batch size to process examples. Should be a power of 2. - max_steps : int - The maximum number of mini-batches to iterate over. - report_stats_every_n_steps : int - The frequency with which to report summary statistics. You can - set this value to be equivalent to a training epoch - (`n_steps * batch_size`) being the total number of samples - seen by the model so far. 
Selene evaluates the model on the validation - dataset every `report_stats_every_n_steps` and, if the model obtains - the best performance so far (based on the user-specified loss function), - Selene saves the model state to a file called `best_model.pth.tar` in - `output_dir`. - output_dir : str - The output directory to save model checkpoints and logs in. - save_checkpoint_every_n_steps : int or None, optional - Default is 1000. If None, set to the same value as - `report_stats_every_n_steps` - save_new_checkpoints_after_n_steps : int or None, optional - Default is None. The number of steps after which Selene will - continually save new checkpoint model weights files - (`checkpoint-.pth.tar`) every - `save_checkpoint_every_n_steps`. Before this point, - the file `checkpoint.pth.tar` is overwritten every - `save_checkpoint_every_n_steps` to limit the memory requirements. - n_validation_samples : int or None, optional - Default is `None`. Specify the number of validation samples in the - validation set. If `n_validation_samples` is `None` and the data sampler - used is the `selene_sdk.samplers.IntervalsSampler` or - `selene_sdk.samplers.RandomSampler`, we will retrieve 32000 - validation samples. If `None` and using - `selene_sdk.samplers.MultiSampler`, we will use all - available validation samples from the appropriate data file. - n_test_samples : int or None, optional - Default is `None`. Specify the number of test samples in the test set. - If `n_test_samples` is `None` and - - - the sampler you specified has no test partition, you should not - specify `evaluate` as one of the operations in the `ops` list. - That is, Selene will not automatically evaluate your trained - model on a test dataset, because the sampler you are using does - not have any test data. - - the sampler you use is of type `selene_sdk.samplers.OnlineSampler` - (and the test partition exists), we will retrieve 640000 test - samples. - - the sampler you use is of type - `selene_sdk.samplers.MultiSampler` (and the test partition - exists), we will use all the test samples available in the - appropriate data file. - - cpu_n_threads : int, optional - Default is 1. Sets the number of OpenMP threads used for parallelizing - CPU operations. - use_cuda : bool, optional - Default is `False`. Specify whether a CUDA-enabled GPU is available - for torch to use during training. - data_parallel : bool, optional - Default is `False`. Specify whether multiple GPUs are available - for torch to use during training. - logging_verbosity : {0, 1, 2}, optional - Default is 2. Set the logging verbosity level. - - * 0 - Only warnings will be logged. - * 1 - Information and warnings will be logged. - * 2 - Debug messages, information, and warnings will all be\ - logged. - - checkpoint_resume : str or None, optional - Default is `None`. If `checkpoint_resume` is not None, it should be the - path to a model file generated by `torch.save` that can now be read - using `torch.load`. - use_scheduler : bool, optional - Default is `True`. If `True`, learning rate scheduler is used to - reduce learning rate on plateau. PyTorch ReduceLROnPlateau scheduler - with patience=16 and factor=0.8 is used. Different scheduler parameters - can be specified with `scheduler_kwargs`. - deterministic : bool, optional - Default is `False`. If `True`, will set - `torch.backends.cudnn.deterministic` to True and - `torch.backends.cudnn.benchmark = False`. 
In Selene CLI, - if `random_seed` is set in the configuration YAML, Selene automatically - passes in `deterministic=True` to the TrainModel class. - scheduler_kwargs : dict, optional - Default is patience=16, verbose=True, and factor=0.8. Set the parameters - for the PyTorch ReduceLROnPlateau scheduler. - stopping_criteria : list or None, optional - Default is `None`. If `stopping_criteria` is not None, it should be a - list specifying how to use early stopping. The first value should be - a str corresponding to one of `metrics`. The second value should be an - int indicating the patience. If the specified metric does not improve - in the given patience (usually corresponding to the number of epochs), - training stops early. - - Attributes - ---------- - model : torch.nn.Module - The model to train. - sampler : selene_sdk.samplers.Sampler - The example generator. - criterion : torch.nn._Loss - The loss function to optimize. - optimizer : torch.optim.Optimizer - The optimizer to minimize loss with. - batch_size : int - The size of the mini-batch to use during training. - max_steps : int - The maximum number of mini-batches to iterate over. - nth_step_report_stats : int - The frequency with which to report summary statistics. - nth_step_save_checkpoint : int - The frequency with which to save a model checkpoint. - use_cuda : bool - If `True`, use a CUDA-enabled GPU. If `False`, use the CPU. - data_parallel : bool - Whether to use multiple GPUs or not. - output_dir : str - The directory to save model checkpoints and logs. - - """ - - def __init__(self, - model, - data_sampler, - optimizer_class, - optimizer_kwargs, - batch_size, - max_steps, - report_stats_every_n_steps, - output_dir, - save_checkpoint_every_n_steps=1000, - save_new_checkpoints_after_n_steps=None, - report_gt_feature_n_positives=10, - n_validation_samples=None, - n_test_samples=None, - cpu_n_threads=1, - use_cuda=False, - data_parallel=False, - logging_verbosity=2, - checkpoint_resume=None, - metrics=dict(roc_auc=roc_auc_score, - average_precision=average_precision_score), - use_scheduler=True, - deterministic=False, - scheduler_kwargs=dict(patience=16, - verbose=True, - factor=0.8), - stopping_criteria=None): - """ - Constructs a new `TrainModel` object. 
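
The scheduler configured above is the standard PyTorch `ReduceLROnPlateau`. A minimal sketch of wiring it to a validation loss (the model, optimizer, and loss value are placeholders):

    import torch
    from torch.optim.lr_scheduler import ReduceLROnPlateau

    model = torch.nn.Linear(4, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=16, factor=0.8)
    validation_loss = 0.42           # placeholder; use the real per-epoch loss
    scheduler.step(validation_loss)  # lowers the LR once the loss plateaus
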
- """ - self.model = model - self.sampler = data_sampler - - # these can be made parameters if desired - self.cls_criterion = nn.BCELoss() - self.reg_criterion = F.mse_loss - - self.optimizer = optimizer_class( - self.model.parameters(), **optimizer_kwargs) - - self.batch_size = batch_size - self.max_steps = max_steps - self.nth_step_report_stats = report_stats_every_n_steps - self.nth_step_save_checkpoint = None - - if not save_checkpoint_every_n_steps: - self.nth_step_save_checkpoint = report_stats_every_n_steps - else: - self.nth_step_save_checkpoint = save_checkpoint_every_n_steps - - self._save_new_checkpoints = save_new_checkpoints_after_n_steps - - os.makedirs(output_dir, exist_ok=True) - self.output_dir = output_dir - - initialize_logger( - os.path.join(self.output_dir, "{0}.log".format(__name__)), - verbosity=logging_verbosity) - - if deterministic: - logger.info("Setting deterministic = True for reproducibility.") - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - - - logger.info("Training parameters set: batch size {0}, " - "number of steps per 'epoch': {1}, " - "maximum number of steps: {2}".format( - self.batch_size, - self.nth_step_report_stats, - self.max_steps)) - - torch.set_num_threads(cpu_n_threads) - - self.use_cuda = use_cuda - self.data_parallel = data_parallel - - if self.data_parallel: - self.model = nn.DataParallel(model) - logger.debug("Wrapped model in DataParallel") - - if self.use_cuda: - self.model.cuda() - #self.cls_criterion.cuda() - #self.reg_criterion.cuda() - logger.debug("Set modules to use CUDA") - - self._report_gt_feature_n_positives = report_gt_feature_n_positives - self._metrics = dict(cls_average_precision=average_precision_score, - pearsonr=pearsonr, spearmanr=spearmanr) - #reg_mse=mean_squared_error) - self._n_validation_samples = n_validation_samples - self._n_test_samples = n_test_samples - self._use_scheduler = use_scheduler - - self._init_train(scheduler_kwargs) - self._init_validate() - if "test" in self.sampler.modes: - self._init_test() - if checkpoint_resume is not None: - self._load_checkpoint(checkpoint_resume) - - if type(stopping_criteria) is list and len(stopping_criteria) == 2: - stopping_metric, stopping_patience = stopping_criteria - self._early_stopping = True - if stopping_metric in self._metrics: - self._stopping_metric = stopping_metric - self._stopping_patience = stopping_patience - self._stopping_reached = False - else: - logger.warning("Did not recognize stopping metric. Not performing early stopping.") - self._early_stopping = False - else: - self._early_stopping = False - - def _load_checkpoint(self, checkpoint_resume): - checkpoint = torch.load( - checkpoint_resume, - map_location=lambda storage, location: storage) - if "state_dict" not in checkpoint: - raise ValueError( - ("'state_dict' not found in file {0} " - "loaded with method `torch.load`. 
Selene does not support " - "continued training of models that were not originally " - "trained using Selene.").format(checkpoint_resume)) - - self.model = load_model_from_state_dict( - checkpoint["state_dict"], self.model) - - self._start_step = checkpoint["step"] - if self._start_step >= self.max_steps: - self.max_steps += self._start_step - - self._min_loss = checkpoint["min_loss"] - self.optimizer.load_state_dict( - checkpoint["optimizer"]) - if self.use_cuda: - for state in self.optimizer.state.values(): - for k, v in state.items(): - if isinstance(v, torch.Tensor): - state[k] = v.cuda() - - logger.info( - ("Resuming from checkpoint: step {0}, min loss {1}").format( - self._start_step, self._min_loss)) - - def _init_train(self, scheduler_kwargs): - self._start_step = 0 - self._train_logger = _metrics_logger( - "{0}.train".format(__name__), self.output_dir) - self._train_logger.info("loss") - if self._use_scheduler: - self.scheduler = ReduceLROnPlateau( - self.optimizer, - 'min', - **scheduler_kwargs) - self._time_per_step = [] - self._train_loss = [] - - def _init_validate(self): - self._min_loss = float("inf") # TODO: Should this be set when it is used later? Would need to if we want to train model 2x in one run. - self._create_validation_set(n_samples=self._n_validation_samples) - self._validation_metrics = MethylationPerformanceMetrics( - self.sampler.get_feature_from_index, - report_gt_feature_n_positives=self._report_gt_feature_n_positives, - metrics=self._metrics) - self._validation_logger = _metrics_logger( - "{0}.validation".format(__name__), self.output_dir) - - self._validation_logger.info("\t".join(["loss", "mse", "bce"] + - sorted([x for x in self._validation_metrics.metrics.keys()]))) - - def _init_test(self): - self._test_data = None - self._n_test_samples = self._n_test_samples - self._test_metrics = MethylationPerformanceMetrics( - self.sampler.get_feature_from_index, - report_gt_feature_n_positives=self._report_gt_feature_n_positives, - metrics=self._metrics) - - def _create_validation_set(self, n_samples=None): - """ - Generates the set of validation examples. - - Parameters - ---------- - n_samples : int or None, optional - Default is `None`. The size of the validation set. If `None`, - will use all validation examples in the sampler. - - """ - logger.info("Creating validation dataset.") - t_i = time() - self._validation_data, self._all_validation_targets, self._all_validation_inds = \ - self.sampler.get_validation_set( - self.batch_size, n_samples=n_samples) - t_f = time() - logger.info(("{0} s to load {1} validation examples ({2} validation " - "batches) to evaluate after each training step.").format( - t_f - t_i, - len(self._validation_data) * self.batch_size, - len(self._validation_data))) - - def create_test_set(self): - """ - Loads the set of test samples. - We do not create the test set in the `TrainModel` object until - this method is called, so that we avoid having to load it into - memory until the model has been trained and is ready to be - evaluated. 
- - """ - logger.info("Creating test dataset.") - t_i = time() - self._test_data, self._all_test_targets, self._all_test_inds = \ - self.sampler.get_test_set( - self.batch_size, n_samples=self._n_test_samples) - t_f = time() - logger.info(("{0} s to load {1} test examples ({2} test batches) " - "to evaluate after all training steps.").format( - t_f - t_i, - len(self._test_data) * self.batch_size, - len(self._test_data))) - np.savez_compressed( - os.path.join(self.output_dir, "test_targets.npz"), - data=self._all_test_targets) - - def _get_batch(self): - """ - Fetches a mini-batch of examples - - Returns - ------- - tuple(numpy.ndarray, numpy.ndarray) - A tuple containing the examples and targets. - - """ - t_i_sampling = time() - batch_sequences, batch_targets, batch_inds = self.sampler.sample( - batch_size=self.batch_size) - t_f_sampling = time() - logger.debug( - ("[BATCH] Time to sample {0} examples: {1} s.").format( - self.batch_size, - t_f_sampling - t_i_sampling)) - return (batch_sequences, batch_targets, batch_inds) - - def _checkpoint(self): - checkpoint_dict = { - "step": self.step, - "arch": self.model.__class__.__name__, - "state_dict": self.model.state_dict(), - "min_loss": self._min_loss, - "optimizer": self.optimizer.state_dict() - } - if self._save_new_checkpoints is not None and \ - self._save_new_checkpoints >= self.step: - checkpoint_filename = "checkpoint-{0}".format( - strftime("%m%d%H%M%S")) - self._save_checkpoint( - checkpoint_dict, False, filename=checkpoint_filename) - logger.debug("Saving checkpoint `{0}.pth.tar`".format( - checkpoint_filename)) - else: - self._save_checkpoint( - checkpoint_dict, False) - - def train_and_validate(self): - """ - Trains the model and measures validation performance. - - """ - for step in range(self._start_step, self.max_steps): - self.step = step - self.train() - - if step % self.nth_step_save_checkpoint == 0: - self._checkpoint() - if self.step and self.step % self.nth_step_report_stats == 0: - self.validate() - if self._early_stopping and self._stopping_reached: - logger.debug("Patience ran out. Stopping early.") - break - - self.sampler.save_dataset_to_file("train", close_filehandle=True) - - def train(self): - """ - Trains the model on a batch of data. - - Returns - ------- - float - The training loss. - - """ - t_i = time() - self.model.train() - self.sampler.set_mode("train") - - inputs, targets, inds = self._get_batch() - inputs = torch.Tensor(inputs) - targets = torch.Tensor(targets) - inds = torch.Tensor(inds) - - if self.use_cuda: - inputs = inputs.cuda() - targets = targets.cuda() - inds = inds.cuda() - - #start = torch.cuda.Event(enable_timing=True) - #end = torch.cuda.Event(enable_timing=True) - - #start.record() - #cls_pred, reg_pred = self.model(inputs.transpose(1, 2)) - pred = self.model(inputs.transpose(1, 2)) - #end.record() - #torch.cuda.synchronize() - #print("model time: {0}".format(start.elapsed_time(end))) - - cls_pred = pred[:, 0] - reg_pred = pred[:, 1:] - cls_loss = self.cls_criterion(cls_pred, inds.squeeze()) - - reg_loss = 0 - inds = inds.ravel() - if len(inds[inds == 1]) != 0: - subset_pred = reg_pred[inds == 1] - subset_tgt = targets[inds == 1] - mask = torch.isnan(subset_tgt) - reg_loss = self.reg_criterion( - subset_pred[~mask], subset_tgt[~mask]) - # TODO: write a custom loss fn based on this? 
- #reg_loss = self.reg_criterion( - # subset_pred, subset_tgt, reduction='none').nanmean(dim=1).mean() - - self.optimizer.zero_grad() - total_loss = cls_loss #* 0.01 - if torch.is_tensor(reg_loss): - total_loss += reg_loss - total_loss.backward() - self.optimizer.step() - - self._train_loss.append(total_loss.item()) - t_f = time() - - self._time_per_step.append(t_f - t_i) - if self.step and self.step % self.nth_step_report_stats == 0: - logger.info(("[STEP {0}] average number " - "of steps per second: {1:.1f}").format( - self.step, 1. / np.average(self._time_per_step))) - self._train_logger.info(np.average(self._train_loss)) - logger.info("training loss: {0}".format(np.average(self._train_loss))) - self._time_per_step = [] - self._train_loss = [] - - - def _evaluate_on_data(self, data_in_batches): - """ - Makes predictions for some labeled input data. - - Parameters - ---------- - data_in_batches : list(tuple(numpy.ndarray, numpy.ndarray)) - A list of tuples of the data, where the first element is - the example, and the second element is the label. - - Returns - ------- - tuple(float, list(numpy.ndarray)) - Returns the average loss, and the list of all predictions. - - """ - self.model.eval() - - batch_bce, batch_mse, batch_losses = [], [], [] - all_predictions = [] - all_ind_predictions = [] - for (seqs, targets, inds) in data_in_batches: - seqs = torch.Tensor(seqs) - targets = torch.Tensor(targets) - inds = torch.Tensor(inds) - if self.use_cuda: - seqs = seqs.cuda() - targets = targets.cuda() - inds = inds.cuda() - - with torch.no_grad(): - #cls_pred, reg_pred = self.model(seqs.transpose(1, 2)) - pred = self.model(seqs.transpose(1, 2)) - cls_pred = pred[:, 0] - reg_pred = pred[:, 1:] - cls_loss = self.cls_criterion(cls_pred, inds) - - reg_loss = 0 - if len(inds[inds == 1]) != 0: - subset_pred = reg_pred[inds == 1] - subset_tgt = targets[inds == 1] - mask = torch.isnan(subset_tgt) - reg_loss = self.reg_criterion( - subset_pred[~mask], subset_tgt[~mask]) - #subset_pred = reg_pred[inds == 1] - #subset_tgt = targets[inds == 1] - #reg_loss = self.reg_criterion( - # subset_pred, subset_tgt, reduction='none').nanmean(dim=1).mean() - - all_predictions.append(reg_pred) - all_ind_predictions.append(cls_pred) - - batch_bce.append(cls_loss.item()) - total_loss = cls_loss #* 0.01 - if torch.is_tensor(reg_loss): - total_loss += reg_loss - batch_mse.append(reg_loss.item()) - batch_losses.append(total_loss.item()) - all_predictions = torch.vstack(all_predictions) - all_ind_predictions = torch.hstack(all_ind_predictions)[:, None] - all_predictions = all_predictions.data.cpu().numpy() - all_ind_predictions = all_ind_predictions.data.cpu().numpy() - - loss_breakdown = { - 'bce': np.average(batch_bce), - 'mse': np.average(batch_mse), - 'loss': np.average(batch_losses) - } - return loss_breakdown, all_ind_predictions, all_predictions - - def validate(self): - """ - Measures model validation performance. - - Returns - ------- - dict - A dictionary, where keys are the names of the loss metrics, - and the values are the average value for that metric over - the validation set. 
- - """ - lossdict, val_pred_inds, val_pred_tgts = self._evaluate_on_data( - self._validation_data) - cls_valid_scores = self._validation_metrics.update( - val_pred_inds, self._all_validation_inds.reshape(len(val_pred_inds), 1), - scores=['cls_average_precision']) - reg_valid_scores = self._validation_metrics.update( - val_pred_tgts[self._all_validation_inds == 1], - self._all_validation_targets[self._all_validation_inds == 1], - scores=['pearsonr', 'spearmanr']) - for name, score in cls_valid_scores.items(): - logger.info("validation {0}: {1}".format(name, score)) - lossdict[name] = score - for name, score in reg_valid_scores.items(): - logger.info("validation {0}: {1}".format(name, score)) - lossdict[name] = score - - to_log = [lossdict['loss'], lossdict['mse'], lossdict['bce']] - for k in sorted(self._validation_metrics.metrics.keys()): - if k in lossdict and lossdict[k]: - to_log.append(lossdict[k]) - else: - to_log.append("NA") - self._validation_logger.info('\t'.join([str(s) for s in to_log])) - validation_loss = lossdict['loss'] - # scheduler update - if self._use_scheduler: - self.scheduler.step( - math.ceil(validation_loss * 1000.0) / 1000.0) - - # save best_model - if validation_loss < self._min_loss: - self._min_loss = validation_loss - self._save_checkpoint({ - "step": self.step, - "arch": self.model.__class__.__name__, - "state_dict": self.model.state_dict(), - "min_loss": self._min_loss, - "optimizer": self.optimizer.state_dict()}, True) - logger.debug("Updating `best_model.pth.tar`") - logger.info("validation loss: {0} (bce: {1}, mse: {2})".format( - validation_loss, lossdict['bce'], lossdict['mse'])) - - # check for early stopping - if self._early_stopping: - stopping_metric = self._validation_metrics.metrics[self._stopping_metric].data - index = np.argmax(stopping_metric) - if self._stopping_patience - (len(stopping_metric) - index - 1) <= 0: - self._stopping_reached = True - - def evaluate(self): - """ - Measures the model test performance. - - Returns - ------- - dict - A dictionary, where keys are the names of the loss metrics, - and the values are the average value for that metric over - the test set. - - """ - if self._test_data is None: - self.create_test_set() - losdict, test_pred_inds, test_pred_tgts = self._evaluate_on_data( - self._test_data) - - test_scores = self._test_metrics.update( - test_pred_inds, self._all_test_inds.reshape(len(test_pred_inds), 1), - scores=['cls_average_precision']) - - #reg_test_scores = self._test_metrics.update( - # test_pred_tgts[self._all_test_inds == 1], - # self._all_test_targets[self._all_test_inds == 1], - # scores=['reg_mse']) - - np.savez_compressed( - os.path.join(self.output_dir, "test_predictions.npz"), - data=test_pred_tgts) - np.savez_compressed( - os.path.join(self.output_dir, "test_predictions.indicator.npz"), - data=test_pred_inds) - - for name, score in test_scores.items(): - logger.info("test {0}: {1}".format(name, score)) - lossdict[name] = score - #for name, score in reg_test_scores.items(): - # logger.info("test {0}: {1}".format(name, score)) - # lossdict[name] = score - - test_performance = os.path.join( - self.output_dir, "test_performance.txt") - feature_scores_dict = self._test_metrics.write_feature_scores_to_file( - test_performance) - - return (lossdict, feature_scores_dict) - - def _save_checkpoint(self, - state, - is_best, - filename="checkpoint"): - """ - Saves snapshot of the model state to file. 
Will save a checkpoint - with name `.pth.tar` and, if this is the model's best - performance so far, will save the state to a `best_model.pth.tar` - file as well. - - Models are saved in the state dictionary format. This is a more - stable format compared to saving the whole model (which is another - option supported by PyTorch). Note that we do save a number of - additional, Selene-specific parameters in the dictionary - and that the actual `model.state_dict()` is stored in the `state_dict` - key of the dictionary loaded by `torch.load`. - - See: https://pytorch.org/docs/stable/notes/serialization.html for more - information about how models are saved in PyTorch. - - Parameters - ---------- - state : dict - Information about the state of the model. Note that this is - not `model.state_dict()`, but rather, a dictionary containing - keys that can be used for continued training in Selene - _in addition_ to a key `state_dict` that contains - `model.state_dict()`. - is_best : bool - Is this the model's best performance so far? - filename : str, optional - Default is "checkpoint". Specify the checkpoint filename. Will - append a file extension to the end of the `filename` - (e.g. `checkpoint.pth.tar`). - - Returns - ------- - None - - """ - logger.debug("[TRAIN] {0}: Saving model state to file.".format( - state["step"])) - cp_filepath = os.path.join( - self.output_dir, filename) - torch.save(state, "{0}.pth.tar".format(cp_filepath)) - if is_best: - best_filepath = os.path.join(self.output_dir, "best_model") - shutil.copyfile("{0}.pth.tar".format(cp_filepath), - "{0}.pth.tar".format(best_filepath)) From 95a234a1262ef70cd2588be701cac23ecfb40d92 Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 6 Dec 2024 15:35:57 -0500 Subject: [PATCH 51/62] fix bug in random positions sampler with exclude_chrs, overload targets_path in both sampler classes --- selene_sdk/samplers/online_sampler.py | 15 +++++++++++---- selene_sdk/samplers/random_positions_sampler.py | 10 ++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/selene_sdk/samplers/online_sampler.py b/selene_sdk/samplers/online_sampler.py index 53f21884..ca0b0b69 100644 --- a/selene_sdk/samplers/online_sampler.py +++ b/selene_sdk/samplers/online_sampler.py @@ -25,9 +25,13 @@ class OnlineSampler(Sampler, metaclass=ABCMeta): ---------- reference_sequence : selene_sdk.sequences.Sequence A reference sequence from which to create examples. - target_path : str + target_path : str or selene_sdk.targets.Target Path to tabix-indexed, compressed BED file (`*.bed.gz`) of genomic coordinates mapped to the genomic features we want to predict. + `target_path` will be loaded as a `GenomicFeatures` object. + Currently, `target_path` is also overloaded to accept a + `selene_sdk.targets.Target` object directly, either `GenomicFeatures` + or `GenomicFeaturesH5`. features : list(str) List of distinct features that we aim to predict. 
seed : int, optional @@ -213,9 +217,12 @@ def __init__(self, self.reference_sequence = reference_sequence self.n_features = len(self._features) - self.target = GenomicFeatures( - target_path, self._features, - feature_thresholds=feature_thresholds) + if isinstance(target_path, str): + self.target = GenomicFeatures( + target_path, self._features, + feature_thresholds=feature_thresholds) + else: + self.target = target_path self._save_filehandles = {} def get_feature_from_index(self, index): diff --git a/selene_sdk/samplers/random_positions_sampler.py b/selene_sdk/samplers/random_positions_sampler.py index dedf2442..fbb27c62 100644 --- a/selene_sdk/samplers/random_positions_sampler.py +++ b/selene_sdk/samplers/random_positions_sampler.py @@ -54,9 +54,13 @@ class RandomPositionsSampler(OnlineSampler): ---------- reference_sequence : selene_sdk.sequences.Genome A reference sequence from which to create examples. - target_path : str + target_path : str or selene_sdk.targets.Target Path to tabix-indexed, compressed BED file (`*.bed.gz`) of genomic coordinates mapped to the genomic features we want to predict. + `target_path` will be loaded as a `GenomicFeatures` object. + Currently, `target_path` is also overloaded to accept a + `selene_sdk.targets.Target` object directly, either `GenomicFeatures` + or `GenomicFeaturesH5`. features : list(str) List of distinct features that we aim to predict. seed : int, optional @@ -222,7 +226,8 @@ def _partition_genome_by_proportion(self): def _partition_genome_by_chromosome(self): for mode in self.modes: self._sample_from_mode[mode] = SampleIndices([], []) - for index, (chrom, len_chrom) in enumerate(self.reference_sequence.get_chr_lens()): + index = 0 + for (chrom, len_chrom) in self.reference_sequence.get_chr_lens(): skip = False for excl in self.exclude_chrs: if excl in chrom: @@ -246,6 +251,7 @@ def _partition_genome_by_chromosome(self): self.sequence_length, len_chrom - self.sequence_length)) self.interval_lengths.append(len_chrom - 2 * self.sequence_length) + index += 1 for mode in self.modes: sample_indices = self._sample_from_mode[mode].indices From 55d30b8e6c55a936cdb41590c1faaab559d4796f Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 6 Dec 2024 15:36:10 -0500 Subject: [PATCH 52/62] remove unused method in genomicfeaturesh5 --- selene_sdk/targets/genomic_features_h5.py | 48 +---------------------- 1 file changed, 2 insertions(+), 46 deletions(-) diff --git a/selene_sdk/targets/genomic_features_h5.py b/selene_sdk/targets/genomic_features_h5.py index be889094..9faa4fcd 100644 --- a/selene_sdk/targets/genomic_features_h5.py +++ b/selene_sdk/targets/genomic_features_h5.py @@ -24,50 +24,13 @@ from .target import Target -def _get_target_data(chrom, start, end, - thresholds, target_index_dict, get_target_rows): - """ - Generates a target vector for the given query region. - - Parameters - ---------- - chrom : str - The name of the region (e.g. 'chr1', 'chr2', ..., 'chrX', - 'chrY') to query inside of. - start : int - The 0-based start coordinate of the region to query. - end : int - One past the last coordinate of the region to query. - thresholds : np.ndarray, dtype=numpy.float32 - An array of target thresholds, where the value in position - `i` corresponds to the threshold for the target name that is - mapped to index `i` by `target_index_dict`. - target_index_dict : dict - A dictionary mapping target names (`str`) to indices (`int`), - where the index is the position of the target in `targets`. 
- get_target_rows : types.FunctionType - A function that takes coordinates and returns rows - (`list(tuple(int, int, str))`). - - Returns - ------- - numpy.ndarray, dtype=int - A target vector where the `i`th position is equal to one if the - `i`th target is positive, and zero otherwise. - - """ - rows = get_target_rows(chrom, start, end) - return _fast_get_target_data( - start, end, thresholds, target_index_dict, rows) - - class GenomicFeaturesH5(Target): """ Stores the dataset specifying sequence regions and targets. Accepts a tabix-indexed `*.bed` file with the following columns, in order: :: - [chrom, start, end, strand, index] + [chrom, start, end, index] and an HDF5 file of the target values in a matrix with key `targets`. @@ -104,9 +67,6 @@ class GenomicFeaturesH5(Target): The matrix of target data corresponding to the coordinates in `coords`. n_targets : int The number of distinct targets. - target_index_dict : dict - A dictionary mapping target names (`str`) to indices (`int`), - where the index is the position of the target in `targets`. """ def __init__(self, @@ -122,10 +82,6 @@ def __init__(self, self.n_targets = len(targets) - self.target_index_dict = dict( - [(feat, index) for index, feat in enumerate(targets)]) - self.index_target_dict = dict(list(enumerate(targets))) - self._initialized = False if init_unpicklable: @@ -243,7 +199,7 @@ def get_feature_data(self, chrom, start, end): row_targets = [] for r in rows: - ix = int(r[3]) + ix = int(r[-1]) row_targets.append(self.data[ix]) if len(row_targets) == 0: From 597f570770496359595de3fadcf520e302083a1d Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 6 Dec 2024 16:10:17 -0500 Subject: [PATCH 53/62] adjust strand vs feature (target) column ordering assumptions in tabix-indexed BED file --- selene_sdk/targets/genomic_features.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/selene_sdk/targets/genomic_features.py b/selene_sdk/targets/genomic_features.py index d3964bee..8fe3e593 100644 --- a/selene_sdk/targets/genomic_features.py +++ b/selene_sdk/targets/genomic_features.py @@ -201,10 +201,14 @@ class GenomicFeatures(Target): :: [chrom, start, end, strand, feature] + or, if the targets (genomic features) to predict are strand-agnostic: + :: + [chrom, start, end, feature] + Note that `chrom` is interchangeable with any sort of region (e.g. a protein in a FAA file). Further, `start` is 0-based. Lastly, any - addition columns following the five shown above will be ignored. + additional columns following those shown above will be ignored.
Parameters ---------- @@ -337,7 +341,7 @@ def _query_tabix(self, chrom, start, end, strand=None): try: tabix_query = self.data.query(chrom, start, end) if strand == '+' or strand == '-': - return [line for line in tabix_query if str(line[4]) == strand] # strand specificity + return [line for line in tabix_query if str(line[3]) == strand] # strand specificity else: # not strand specific return tabix_query except tabix.TabixError: @@ -405,10 +409,10 @@ def get_feature_data(self, chrom, start, end, strand=None): if not rows: return features for r in rows: - feature = r[3] + feature = r[-1] ix = self.feature_index_dict[feature] features[ix] = 1 return features return _get_feature_data( chrom, start, end, self._feature_thresholds_vec, - self.feature_index_dict, self._query_tabix, strand=strand) \ No newline at end of file + self.feature_index_dict, self._query_tabix, strand=strand) From 34fa342ae7f3800dd23460529f22389d718b3c4f Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 6 Dec 2024 16:16:42 -0500 Subject: [PATCH 54/62] adjust descriptions for target classes --- selene_sdk/targets/genomic_features.py | 4 ++-- selene_sdk/targets/genomic_features_h5.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/selene_sdk/targets/genomic_features.py b/selene_sdk/targets/genomic_features.py index 8fe3e593..e498094e 100644 --- a/selene_sdk/targets/genomic_features.py +++ b/selene_sdk/targets/genomic_features.py @@ -5,8 +5,8 @@ It accepts the path to a tabix-indexed .bed.gz file of genomic coordinates. -This .tsv/.bed file must contain the following columns, in order: - chrom ('1', '2', ..., 'X', 'Y'), start (0-based), end, feature +This .bed file must contain the following columns, in order: + chrom, start (0-based), end, feature (target) Additionally, the column names should be omitted from the file itself (i.e. there is no header and the first line in the file is the first row of genome coordinates for a feature). diff --git a/selene_sdk/targets/genomic_features_h5.py b/selene_sdk/targets/genomic_features_h5.py index 9faa4fcd..ee757c4e 100644 --- a/selene_sdk/targets/genomic_features_h5.py +++ b/selene_sdk/targets/genomic_features_h5.py @@ -6,8 +6,8 @@ It accepts the path to a tabix-indexed .bed.gz file of genomic coordinates and the path to an HDF5 file containing the continuous-valued targets as a matrix. -This .tsv/.bed file must contain the following columns, in order: - chrom ('1', '2', ..., 'X', 'Y'), start (0-based), end, index +This .bed file must contain the following columns, in order: + chrom, start (0-based), end, index where the index is the index of the corresponding row in the HDF5 file. Additionally, the column names should be omitted from the file itself From 8d84a178818ca37e1051b7edd7efc4afdc11844a Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 6 Dec 2024 16:20:44 -0500 Subject: [PATCH 55/62] minor adjustments to docstrings --- selene_sdk/targets/genomic_features_h5.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/selene_sdk/targets/genomic_features_h5.py b/selene_sdk/targets/genomic_features_h5.py index ee757c4e..c89b3df6 100644 --- a/selene_sdk/targets/genomic_features_h5.py +++ b/selene_sdk/targets/genomic_features_h5.py @@ -37,7 +37,7 @@ class GenomicFeaturesH5(Target): Note that `chrom` is interchangeable with any sort of region (e.g. a protein in a FAA file). Further, `start` is 0-based. The `index` corresponds to the row index of the targets in the HDF5 file. Lastly, any - addition columns following the five shown above will be ignored. 
+ additional columns following those shown above will be ignored. Parameters ---------- @@ -146,6 +146,7 @@ def is_positive(self, chrom, start, end): The 0-based first position in the region. end : int One past the 0-based last position in the region. + Returns ------- bool From fb2cd99d869b0c500d3285d35adba46ffeb1c16d Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 6 Dec 2024 16:28:07 -0500 Subject: [PATCH 56/62] tuple output handling for non strand specific --- selene_sdk/utils/non_strand_specific_module.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/selene_sdk/utils/non_strand_specific_module.py b/selene_sdk/utils/non_strand_specific_module.py index 9d0ecd82..ea00bd12 100644 --- a/selene_sdk/utils/non_strand_specific_module.py +++ b/selene_sdk/utils/non_strand_specific_module.py @@ -72,7 +72,12 @@ def forward(self, input): if type(output) == tuple: output1, output2 = output output_from_rev1, output_from_rev2 = output_from_rev - return ((output1 + output_from_rev1) / 2, (output2 + output_from_rev2) / 2) + if self.mode == "mean": + return ((output1 + output_from_rev1) / 2, + (output2 + output_from_rev2) / 2) + else: + return (torch.max(output1, output_from_rev1), + torch.max(output2, output_from_rev2)) else: if self.mode == "mean": return (output + output_from_rev) / 2 From e960ceea21c1c211747c42f56626eccf84023d12 Mon Sep 17 00:00:00 2001 From: Kathy Date: Fri, 6 Dec 2024 16:39:58 -0500 Subject: [PATCH 57/62] line breaks for formatting in performance metrics file --- selene_sdk/utils/performance_metrics.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/selene_sdk/utils/performance_metrics.py b/selene_sdk/utils/performance_metrics.py index e0fd0d71..d3a3a01c 100644 --- a/selene_sdk/utils/performance_metrics.py +++ b/selene_sdk/utils/performance_metrics.py @@ -213,7 +213,9 @@ def compute_score(prediction, target, metric_fn, np.count_nonzero(feature_targets) > report_gt_feature_n_positives: try: output = metric_fn(feature_targets, feature_preds) - if type(output) != float and (metric_fn.__name__ == 'spearmanr' or metric_fn.__name__ == 'pearsonr'): + if type(output) != float and \ + (metric_fn.__name__ == 'spearmanr' or + metric_fn.__name__ == 'pearsonr'): output = output[0] feature_scores[index] = output except ValueError: # do I need to make this more generic?
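[Editor's note] The guard reformatted in PATCH 57 above exists because `scipy.stats.pearsonr` and `scipy.stats.spearmanr` do not return a bare float: each returns an indexable result whose first element is the correlation statistic and whose second is the p-value. A minimal, self-contained sketch of the behavior that guard handles follows; the toy arrays are illustrative only and are not part of the patch series:

    import numpy as np
    from scipy.stats import pearsonr, spearmanr

    feature_targets = np.array([0.10, 0.40, 0.35, 0.80])
    feature_preds = np.array([0.12, 0.38, 0.40, 0.79])

    for metric_fn in (pearsonr, spearmanr):
        output = metric_fn(feature_targets, feature_preds)
        # Correlation metrics return an indexable (statistic, pvalue) result;
        # keep only the statistic so downstream per-feature averaging works.
        if type(output) != float:
            output = output[0]
        print(metric_fn.__name__, float(output))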
From 3fe05aaa3141a953ebf881dc7be1f99de73f5014 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 11 Dec 2024 11:08:24 -0500 Subject: [PATCH 58/62] adjust CLI config parsing so that a copy of the input config file is made and saved to the output dir --- selene_sdk/cli.py | 5 +- selene_sdk/utils/__init__.py | 2 + selene_sdk/utils/config_utils.py | 201 ++++++++++++++++++++++--------- 3 files changed, 146 insertions(+), 62 deletions(-) diff --git a/selene_sdk/cli.py b/selene_sdk/cli.py index c3dd21ea..bcff4a5e 100644 --- a/selene_sdk/cli.py +++ b/selene_sdk/cli.py @@ -14,7 +14,7 @@ import click from selene_sdk import __version__ -from selene_sdk.utils import load_path, parse_configs_and_run +from selene_sdk.utils import load_and_parse_configs_and_run @click.command() @@ -23,8 +23,7 @@ @click.option('--lr', type=float, help='If training, the optimizer learning rate', show_default=True) def main(path, lr): """Build the model and trains it using user-specified input data.""" - configs = load_path(path, instantiate=False) - parse_configs_and_run(configs, lr=lr) + load_and_parse_configs_and_run(path, lr=lr) if __name__ == "__main__": diff --git a/selene_sdk/utils/__init__.py b/selene_sdk/utils/__init__.py index f2fcf51b..31d88794 100644 --- a/selene_sdk/utils/__init__.py +++ b/selene_sdk/utils/__init__.py @@ -19,6 +19,7 @@ from .config import instantiate from .config_utils import initialize_model from .config_utils import execute +from .config_utils import load_and_parse_configs_and_run from .config_utils import parse_configs_and_run from .multi_model_wrapper import MultiModelWrapper from .non_strand_specific_module import NonStrandSpecific @@ -31,6 +32,7 @@ "PerformanceMetrics", "load", "load_path", + "load_and_parse_configs_and_run", "instantiate", "get_indices_and_probabilities", "visualize_roc_curves", diff --git a/selene_sdk/utils/config_utils.py b/selene_sdk/utils/config_utils.py index c872c42f..5ed0ad0d 100644 --- a/selene_sdk/utils/config_utils.py +++ b/selene_sdk/utils/config_utils.py @@ -5,6 +5,7 @@ """ import os import importlib +import string import sys from time import strftime import types @@ -12,11 +13,13 @@ import shutil import yaml +import ruamel.yaml import numpy as np import torch from . import _is_lua_trained_model from . import instantiate +from . import load_path from .. import version @@ -256,6 +259,137 @@ def execute(operations, configs, output_dir): analyze_seqs.get_predictions(**predict_info) +def output_config_yaml(input_file, output_file, new_keys): + # Initialize YAML handler with ruamel.yaml to preserve formatting + yaml = ruamel.yaml.YAML() + # More aggressive formatting settings + yaml.preserve_quotes = True + yaml.default_flow_style = False + yaml.indent(mapping=2, sequence=4, offset=2) + yaml.width = 1000 # Prevent automatic wrapping + yaml.allow_unicode = True + yaml.explicit_start = True # Will start document with '---' + yaml.explicit_end = True # Will end document with '...' 
+ # This is important - it tells ruamel.yaml to format block style + yaml.old_indent = True + + # Read the input file + with open(input_file, 'r') as f: + data = yaml.load(f) + + def convert_to_block_style(data): + if isinstance(data, dict): + for k, v in data.items(): + if isinstance(v, (dict, list)): + if isinstance(v, dict): + v.fa.set_block_style() + convert_to_block_style(v) + elif isinstance(data, list): + for item in data: + if isinstance(item, (dict, list)): + convert_to_block_style(item) + convert_to_block_style(data) + + # Add new keys + for key, value in new_keys.items(): + data[key] = value + + # Write to output file + with open(output_file, 'w') as f: + yaml.dump(data, f) + +def add_config_keys(configs, lr=None): + keys = {} + operations = configs["ops"] + keys["selene_sdk_version"] = version.__version__ + print("Running with selene_sdk version {0}".format(version.__version__)) + + if "train" in operations and "lr" not in configs and lr != None: + keys["lr"] = float(lr) + elif "train" in operations and "lr" in configs and lr != None: + print("Warning: learning rate specified in both the " + "configuration dict and this method's `lr` parameter. " + "Using the `lr` value input to `parse_configs_and_run` " + "({0}, not {1}).".format(lr, configs["lr"])) + elif "train" in operations and "lr" not in configs and lr == None: + raise ValueError("Learning rate not specified, cannot " + "fit model. Exiting.") + for k, v in keys.items(): + configs[k] = v + return keys, configs + + +def get_output_dir(configs, create_subdirectory=True): + operations = configs["ops"] + current_run_output_dir = None + if "output_dir" not in configs and \ + ("train" in operations or "evaluate" in operations): + print("No top-level output directory specified. All constructors " + "to be initialized (e.g. Sampler, TrainModel) that require " + "this parameter must have it specified in their individual " + "parameter configuration.") + elif "output_dir" in configs: + current_run_output_dir = configs["output_dir"] + os.makedirs(current_run_output_dir, exist_ok=True) + if "create_subdirectory" in configs: + create_subdirectory = configs["create_subdirectory"] + if create_subdirectory: + res = ''.join(random.choices(string.ascii_uppercase + + string.digits, k=8)) + current_run_output_dir = os.path.join( + current_run_output_dir, + '{0}-{1}'.format(strftime("%Y-%m-%d-%H-%M-%S"), res)) + os.makedirs(current_run_output_dir) + print("Outputs and logs saved to {0}".format( + current_run_output_dir)) + return current_run_output_dir + + +def copy_model(configs, output_dir): + model_input = configs['model']['path'] + if os.path.isdir(model_input): + shutil.copytree(model_input, + os.path.join(output_dir, + os.path.basename(model_input)), + dirs_exist_ok=True) + else: + shutil.copyfile(model_input, + os.path.join(output_dir, + os.path.basename(model_input))) + + +def set_random_seeds(configs): + if "random_seed" in configs: + seed = configs["random_seed"] + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + print("Setting random seed = {0}".format(seed)) + else: + print("Warning: no random seed specified in config file. 
" + "Using a random seed ensures results are reproducible.") + + +def load_and_parse_configs_and_run(path, create_subdirectory=True, lr=None): + configs = load_path(path, instantiate=False) + keys, configs = add_config_keys(configs, lr=lr) + current_run_output_dir = get_output_dir( + configs, + create_subdirectory=create_subdirectory) + + if current_run_output_dir is not None: + keys['output_dir'] = current_run_output_dir + output_config_yaml( + path, + os.path.join(current_run_output_dir, 'configuration.yml'), + keys) + copy_model(configs, current_run_output_dir) + + set_random_seeds(configs) + execute(configs['ops'], configs, current_run_output_dir) + + def parse_configs_and_run(configs, create_subdirectory=True, lr=None): @@ -310,67 +444,16 @@ def parse_configs_and_run(configs, to the dirs specified in each operation's configuration. """ - operations = configs["ops"] - - configs["selene_sdk_version"] = version.__version__ - print("Running with selene_sdk version {0}".format(version.__version__)) - - if "train" in operations and "lr" not in configs and lr != None: - configs["lr"] = float(lr) - elif "train" in operations and "lr" in configs and lr != None: - print("Warning: learning rate specified in both the " - "configuration dict and this method's `lr` parameter. " - "Using the `lr` value input to `parse_configs_and_run` " - "({0}, not {1}).".format(lr, configs["lr"])) - elif "train" in operations and "lr" not in configs and lr == None: - raise ValueError("Learning rate not specified, cannot " - "fit model. Exiting.") + keys, configs = add_config_keys(configs, lr=lr) + current_run_output_dir = get_output_dir( + configs, create_subdirectory=create_subdirectory) - current_run_output_dir = None - if "output_dir" not in configs and \ - ("train" in operations or "evaluate" in operations): - print("No top-level output directory specified. All constructors " - "to be initialized (e.g. Sampler, TrainModel) that require " - "this parameter must have it specified in their individual " - "parameter configuration.") - elif "output_dir" in configs: - current_run_output_dir = configs["output_dir"] - os.makedirs(current_run_output_dir, exist_ok=True) - if "create_subdirectory" in configs: - create_subdirectory = configs["create_subdirectory"] - if create_subdirectory: - res = ''.join(random.choices(string.ascii_uppercase + - string.digits, k=8)) - current_run_output_dir = os.path.join( - current_run_output_dir, - '{0}-{1}'.format(strftime("%Y-%m-%d-%H-%M-%S"), res)) - os.makedirs(current_run_output_dir) - print("Outputs and logs saved to {0}".format( - current_run_output_dir)) + if current_run_output_dir is not None: configs['output_dir'] = current_run_output_dir config_out = os.path.join(current_run_output_dir, 'configuration.yaml') with open(config_out, 'w') as f: - yaml.dump(configs, f, default_flow_style=None) - model_input = configs['model']['path'] - if os.path.isdir(model_input): - shutil.copytree(model_input, - os.path.join(current_run_output_dir, - os.path.basename(model_input)), - dirs_exist_ok=True) - else: - shutil.copyfile(model_input, - os.path.join(current_run_output_dir, - os.path.basename(model_input))) - - if "random_seed" in configs: - seed = configs["random_seed"] - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - print("Setting random seed = {0}".format(seed)) - else: - print("Warning: no random seed specified in config file. 
" - "Using a random seed ensures results are reproducible.") + yaml.dump(configs, f, default_flow_style=False) + copy_model(configs, current_run_output_dir) - execute(operations, configs, current_run_output_dir) + set_random_seeds(configs) + execute(configs['ops'], configs, current_run_output_dir) From 743c2db50634ed3f733f3b95a0d5347d1cb4ad99 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 11 Dec 2024 12:17:10 -0500 Subject: [PATCH 59/62] update versioning --- selene_sdk/utils/config_utils.py | 4 ++++ selene_sdk/version.py | 2 +- setup.py | 3 ++- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/selene_sdk/utils/config_utils.py b/selene_sdk/utils/config_utils.py index 5ed0ad0d..bd77186b 100644 --- a/selene_sdk/utils/config_utils.py +++ b/selene_sdk/utils/config_utils.py @@ -444,6 +444,10 @@ def parse_configs_and_run(configs, to the dirs specified in each operation's configuration. """ + print("Running `parse_configs_and_run`. For improved reproducibility " + "we would recommend using `load_and_parse_configs_and_run` which " + "will save a format-preserving copy of your input configuration " + "file.") keys, configs = add_config_keys(configs, lr=lr) current_run_output_dir = get_output_dir( configs, create_subdirectory=create_subdirectory) diff --git a/selene_sdk/version.py b/selene_sdk/version.py index 43a1e95b..906d362f 100644 --- a/selene_sdk/version.py +++ b/selene_sdk/version.py @@ -1 +1 @@ -__version__ = "0.5.3" +__version__ = "0.6.0" diff --git a/setup.py b/setup.py index cf8140b8..67210fc2 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ cmdclass = {'build_ext': build_ext} setup(name="selene-sdk", - version="0.5.3", + version="0.6.0", long_description=long_description, long_description_content_type='text/markdown', description=("framework for developing sequence-level " @@ -60,6 +60,7 @@ "pyfaidx", "pytabix", "pyyaml>=5.1", + "ruamel.yaml", "scikit-learn", "scipy", "seaborn", From 7269d791a119bc630c9daefbe1c3a8803de81133 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 11 Dec 2024 12:39:24 -0500 Subject: [PATCH 60/62] add new dependency --- selene-cpu.yml | 3 ++- selene-gpu.yml | 11 ++++++----- selene_sdk/utils/config_utils.py | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/selene-cpu.yml b/selene-cpu.yml index 32a6e1be..89c59eab 100644 --- a/selene-cpu.yml +++ b/selene-cpu.yml @@ -11,8 +11,9 @@ dependencies: - matplotlib=2.0.2 - numpy=1.21.4 - pandas=0.20.3 - - python=3.6.2 + - python=3.9 - pyyaml=5.1 + - ruamel.yaml=0.18.6 - scikit-learn=0.19.0 - scipy=1.1.0 - seaborn=0.8.1 diff --git a/selene-gpu.yml b/selene-gpu.yml index 9a47bb40..c42c3c87 100644 --- a/selene-gpu.yml +++ b/selene-gpu.yml @@ -4,7 +4,7 @@ channels: - bioconda - conda-forge dependencies: -- cython=0.29.3 +- cython=0.29.24 - click==7.1.2 - docopt=0.6.2 - h5py=2.9.0 @@ -15,11 +15,12 @@ dependencies: - statsmodels=0.9.0 - pytabix=0.0.2 - matplotlib=2.2.2 -- python=3.6.5 -- numpy=1.15.1 +- python=3.9 +- ruamel.yaml=0.18.6 +- numpy=1.21.4 - plotly=2.7.0 - cudatoolkit=10.0.130=0 -- pytorch=1.0.1=py3.6_cuda10.0.130_cudnn7.4.2_2 -- torchvision=0.2.2=py_3 +- pytorch=2.4.1=py3.9_cuda11.8_cudnn9.1.0_0 +- torchvision=0.20.0=py39_cu118 - pyfaidx=0.5.5.2 - seaborn=0.8.1 diff --git a/selene_sdk/utils/config_utils.py b/selene_sdk/utils/config_utils.py index bd77186b..f983eaf4 100644 --- a/selene_sdk/utils/config_utils.py +++ b/selene_sdk/utils/config_utils.py @@ -298,6 +298,7 @@ def convert_to_block_style(data): with open(output_file, 'w') as f: yaml.dump(data, f) + def add_config_keys(configs, 
lr=None): keys = {} operations = configs["ops"] From 7b09b6a4ff1cb1d9491f7f033ce2153070c4e370 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 11 Dec 2024 13:14:54 -0500 Subject: [PATCH 61/62] adjustment in --- selene_sdk/samplers/file_samplers/mat_file_sampler.py | 9 ++++----- selene_sdk/samplers/online_sampler.py | 7 ++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/selene_sdk/samplers/file_samplers/mat_file_sampler.py b/selene_sdk/samplers/file_samplers/mat_file_sampler.py index 393764a3..9867d79c 100644 --- a/selene_sdk/samplers/file_samplers/mat_file_sampler.py +++ b/selene_sdk/samplers/file_samplers/mat_file_sampler.py @@ -240,13 +240,12 @@ def get_data_and_targets(self, batch_size, n_samples=None): sequences_and_targets = [] targets_mat = [] - count = 0 - while count < n_samples: - sample_size = min(n_samples - count, batch_size) - seqs, tgts = self.sample(batch_size=sample_size) + for ix in range(0, n_samples, batch_size): + s = ix + e = min(ix+batch_size, n_samples) + seqs, tgts = self.sample(batch_size=e-s) sequences_and_targets.append((seqs, tgts)) targets_mat.append(tgts) - count += sample_size # TODO: should not assume targets are always integers targets_mat = np.vstack(targets_mat).astype(float) diff --git a/selene_sdk/samplers/online_sampler.py b/selene_sdk/samplers/online_sampler.py index ca0b0b69..3aa5405c 100644 --- a/selene_sdk/samplers/online_sampler.py +++ b/selene_sdk/samplers/online_sampler.py @@ -343,9 +343,10 @@ def get_data_and_targets(self, batch_size, n_samples=None, mode=None): elif n_samples is None and mode == "test": n_samples = 640000 - n_batches = int(n_samples / batch_size) - for _ in range(n_batches): - inputs, targets = self.sample(batch_size) + for ix in range(0, n_samples, batch_size): + s = ix + e = min(ix+batch_size, n_samples) + inputs, targets = self.sample(e-s) sequences_and_targets.append((inputs, targets)) targets_mat = np.vstack([t for (s, t) in sequences_and_targets]) if mode in self._save_datasets: From 74477908f26c17533efbe2191cc5ea6bd5e789b5 Mon Sep 17 00:00:00 2001 From: Kathy Date: Wed, 11 Dec 2024 13:20:03 -0500 Subject: [PATCH 62/62] update release notes for 0.6.0 --- RELEASE_NOTES.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index b9b907c6..425c9c15 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -2,6 +2,26 @@ This is a document describing new functionality, bug fixes, breaking changes, etc. associated with Selene version releases from v0.5.0 onwards. +## Version 0.6.0 +- `config_utils.py`: Add additional information saved upon running Selene. Specifically, we now save the version of Selene that the latest run used, make a copy of the input configuration file, and save this along with the model architecture file in the output directory. This adds a new dependency to Selene, the package `ruamel.yaml` +- `H5Dataloader` and `_H5Dataset`: Previously `H5Dataloader` had a number of arguments that were used to then initialize `_H5Dataset` internally. One major change in this version is that we now ask that users initialize `_H5Dataset` explicitly and then pass it to `H5Dataloader` as a class argument. This makes the two classes consistent with the PyTorch specifications for `Dataset` and `DataLoader` classes, enabling them to be compatible with different data parallelization configurations supported by PyTorch and the PyTorch Lightning framework. 
+- `_H5Dataset` class initialization optional arguments: + - `unpackbits` can now be specified separately for sequences and targets by way of `unpackbits_seq` and `unpackbits_tgt`. + - `use_seq_len` enables subsetting to the center `use_seq_len` bases of each sequence in the dataset. + - `shift` (particularly when paired with `use_seq_len`) allows for retrieving sequences shifted from the center position by `shift` bases. Note that `shift` currently only shifts in a single direction, with the sign of the integer determining the direction. +- `GenomicFeaturesH5`: This is a new targets class for continuous-valued targets that are stored in an HDF5 file and retrieved based on genomic coordinates. As before, genomic regions are stored in a tabix-indexed .bed file; the main change is that the BED file now specifies, for each genomic region, the index of the row in the HDF5 matrix that contains all the target values to predict. If multiple target rows are returned for a query region, the average of those rows is returned. +- `RandomPositionsSampler`: + - `exclude_chrs`: Added a new optional argument that by default excludes all nonstandard chromosomes (`exclude_chrs=['_']`) by ignoring all chromosomes with an underscore in the name. Pass in a list of chromosomes or substrings to exclude. When loading possible sampling positions, the class now iterates through the `exclude_chrs` list and checks, for each substring `s` in the list, whether `s in chrom`; if so, it skips that chromosome entirely. + - The internal function `_retrieve` now takes an optional argument `strand` (default `None`) to enable explicit retrieval of the sequence at `chrom, position` for a specific strand side. The default behavior of the `RandomPositionsSampler` class remains the same, with the strand side randomly selected for each genomic position sampled. +- `PerformanceMetrics`: + - Now supports `spearmanr` and `pearsonr` from `scipy.stats`. There is room to generalize this class further in the future. + - The `update` function now has an optional argument `scores`, which accepts a subset of the metrics to compute as `list(str)`. +- `TrainModel`: + - `self.step` now starts from `self._start_step`, which is non-zero if the model is loaded from a Selene-saved checkpoint. + - Removed the call to `self._test_metrics.visualize` in `evaluate`, since the visualize method does not generalize well. +- `NonStrandSpecific`: Can now handle a model whose `forward` returns a tuple of two outputs; the forward and reverse predictions for each of the two outputs are combined by taking either their mean or their max. A custom `NonStrandSpecific` class is recommended for more specific cases. ## Version 0.5.3 - Adjust dependency requirements
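[Editor's note] To make the 0.6.0 release notes above concrete, here is a minimal sketch of the new entry point added in PATCH 58, which is what `selene_sdk/cli.py` now calls; the YAML path and learning rate below are placeholders, and `lr` is only required when the configuration includes a `train` operation:

    from selene_sdk.utils import load_and_parse_configs_and_run

    # Saves a format-preserving copy of the YAML (with the resolved
    # `output_dir`, `lr`, and `selene_sdk_version` keys added) to the run's
    # output directory before executing the configured operations.
    load_and_parse_configs_and_run("configs/train.yml", lr=0.01)

And a sketch of the reworked H5 data-loading flow described in the `H5Dataloader`/`_H5Dataset` note. This is an assumption-laden illustration, not the authoritative API: the HDF5 filename is a placeholder, the keyword names follow the release notes, the import path is assumed from the patched `selene_sdk/samplers/dataloader.py`, and `batch_size` is assumed to pass through to the underlying PyTorch `DataLoader`:

    from selene_sdk.samplers.dataloader import H5Dataloader, _H5Dataset

    # Users now construct the dataset explicitly and hand it to the loader.
    dataset = _H5Dataset(
        "train_data.h5",       # placeholder HDF5 file of sequences/targets
        unpackbits_seq=True,   # sequences stored with np.packbits
        unpackbits_tgt=False,
        use_seq_len=1000,      # keep the center 1000 bases of each sequence
        shift=0,               # optionally retrieve off-center by N bases
    )
    loader = H5Dataloader(dataset, batch_size=64)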