Add H5 GenomicFeatures support for more flexible target datatypes #200

Open: wants to merge 64 commits into base: master

Commits (64)
3946957
selene changes for methylation model
kathyxchen Dec 6, 2022
81e8e45
remove f1 metric
kathyxchen Dec 15, 2022
1b14ecc
bugfix and some temporary profiling
kathyxchen Dec 16, 2022
3be4618
adjustments to loss breakdown and logging
kathyxchen Dec 19, 2022
20eea13
adjust return type for get_features_data
kathyxchen Dec 19, 2022
d34bf84
bugfix for get_feature_data
kathyxchen Dec 19, 2022
3b6f4ad
remove event object init
kathyxchen Dec 20, 2022
c5bad67
initial changes to h5 dataloader
kathyxchen Jan 11, 2023
5d5438a
methylation sampler excl
kathyxchen Jan 11, 2023
831b57a
add type specification to unpackbits
kathyxchen Jan 13, 2023
bc34a88
change loss
kathyxchen Mar 3, 2023
31f55ac
experimenting w loss
kathyxchen Mar 20, 2023
b40b8cd
add pearsonr
kathyxchen Mar 21, 2023
8b5c884
fix metrix nan removal bug
kathyxchen Mar 22, 2023
54ddea1
changes to sampler for positives-only hdf5
kathyxchen Apr 3, 2023
3bcd098
loss weighting
kathyxchen Apr 3, 2023
9e27969
adjust methylation perf metric output checking
kathyxchen Apr 3, 2023
462b729
revert multisampler and add spearmanr to training log
kathyxchen Apr 6, 2023
d0ae091
explicit metric fn checking, refine later
kathyxchen Apr 14, 2023
1fd2d96
minor adjustment to dataloader
kathyxchen Jul 7, 2023
6f75a45
non strand specific temp changes
kathyxchen Jul 10, 2023
8ac4db6
positives only sampler
kathyxchen Jul 14, 2023
58a868b
revamp dataloader, seq length flexibility
kathyxchen Aug 8, 2023
249f302
attempted a nonstrandspecific utils module, not being used
kathyxchen Aug 8, 2023
cb55bbc
evaluation classes
kathyxchen Aug 16, 2023
c761f36
revert non strand specific module
kathyxchen Sep 13, 2023
1ca2b37
trying to figure out unet dataloader changes
kathyxchen Sep 14, 2023
5591680
trying to figure out unet dataloader changes - add tgt shift
kathyxchen Sep 14, 2023
3fa4bb0
remove print debug statements
kathyxchen Sep 18, 2023
79af769
addr memory issue in eval
kathyxchen Sep 19, 2023
e0d0383
comment
kathyxchen Sep 20, 2023
4ce9c99
troubleshooting dataloader shift
kathyxchen Oct 18, 2023
3229569
add strand arg to _retrieve
kathyxchen Oct 20, 2023
d09da28
shift testing
kathyxchen Oct 20, 2023
4f82951
adjust casting of targets in dataloader
kathyxchen Nov 7, 2023
cd390dd
minor changes to eval and sampling
kathyxchen Jan 27, 2024
240f406
remove unused code in nonstrandspecific module
kathyxchen Mar 7, 2024
b7f9968
remove indicators from dataloader, clean up methylation performance m…
kathyxchen Mar 15, 2024
eebe14b
remove unused files from version control, adjust multi sampler get da…
kathyxchen May 1, 2024
152ad61
remove commented out code in shift sections
kathyxchen May 1, 2024
6fc0f7b
remove ind commented out code in multisampler
kathyxchen May 1, 2024
fa69345
add excl chr optional arg
kathyxchen May 1, 2024
3adbf23
make adjustments to nonstrandspecific, performancemetrics, for what c…
kathyxchen May 6, 2024
dca3ae4
remove files that we have merged the functionality into existing classes
kathyxchen May 7, 2024
53f498d
clean up indicator code that is no longer used
kathyxchen May 7, 2024
437092e
integrate changes for methylation prediction in training and metrics
kathyxchen May 28, 2024
6aabd4e
incorporate changes from previous PR on config yaml and model file sa…
kathyxchen May 29, 2024
947825d
update pytorch version constraint
kathyxchen May 29, 2024
eb6edba
variable name fix for copying model file / directory
kathyxchen Jul 8, 2024
e9e54db
merge changes from 0.5.3
kathyxchen Jul 15, 2024
59f70f9
remove train methylation model class
kathyxchen Jul 15, 2024
95a234a
fix bug in random positions sampler with exclude_chrs, overload targe…
kathyxchen Dec 6, 2024
55d30b8
remove unused method in genomicfeaturesh5
kathyxchen Dec 6, 2024
597f570
adjust strand vs feature (target) column ordering assumptions in tabi…
kathyxchen Dec 6, 2024
34fa342
adjust descriptions for target classes
kathyxchen Dec 6, 2024
8d84a17
minor adjustments to docstrings
kathyxchen Dec 6, 2024
fb2cd99
tuple output handling for non strand specific
kathyxchen Dec 6, 2024
e960cee
line breaks for formatting in performance metrics file
kathyxchen Dec 6, 2024
81d676b
Merge remote-tracking branch 'upstream/master' into methylation-sampler
kathyxchen Dec 6, 2024
3fe05aa
adjust CLI config parsing so that a copy of the input config file is …
kathyxchen Dec 11, 2024
743c2db
update versioning
kathyxchen Dec 11, 2024
7269d79
add new dependency
kathyxchen Dec 11, 2024
7b09b6a
adjustment in
kathyxchen Dec 11, 2024
7447790
update release notes for 0.6.0
kathyxchen Dec 11, 2024
20 changes: 20 additions & 0 deletions RELEASE_NOTES.md
@@ -2,6 +2,26 @@

This is a document describing new functionality, bug fixes, breaking changes, etc. associated with Selene version releases from v0.5.0 onwards.

## Version 0.6.0
- `config_utils.py`: Additional information is now saved upon running Selene. Specifically, we record the version of Selene used for the run and make a copy of the input configuration file, saving both alongside the model architecture file in the output directory. This adds a new dependency to Selene, the package `ruamel.yaml`.
- `H5Dataloader` and `_H5Dataset`: Previously, `H5Dataloader` took a number of arguments that it used to initialize `_H5Dataset` internally. One major change in this version is that users now initialize `_H5Dataset` explicitly and pass it to `H5Dataloader` as a class argument. This makes the two classes consistent with the PyTorch specifications for `Dataset` and `DataLoader`, enabling compatibility with the different data-parallelization configurations supported by PyTorch and the PyTorch Lightning framework (see the sketch after this list).
- `_H5Dataset` class initialization optional arguments:
- `unpackbits` can now be specified separately for sequences and targets by way of `unpackbits_seq` and `unpackbits_tgt`
- `use_seq_len` subsets each retrieved sequence to its center `use_seq_len` bases.
- `shift` (particularly when paired with `use_seq_len`) retrieves sequences offset from the center position by `shift` bases. Note that `shift` currently only shifts in a single direction, determined by the sign of the integer passed.
- `GenomicFeaturesH5`: This is a new targets class to handle continuous-valued targets, stored in an HDF5 file, that can be retrieved by genomic coordinate. As with `GenomicFeatures`, genomic regions are stored in a tabix-indexed BED file; the main change is that the BED file now specifies, for each genomic region, the index of the row in the HDF5 matrix that contains the target values to predict. If multiple target rows are returned for a query region, the average of those rows is returned.
- `RandomPositionsSampler`:
- `exclude_chrs`: Added a new optional argument; the default `exclude_chrs=['_']` excludes all nonstandard chromosomes by ignoring any chromosome with an underscore in its name. Pass in a list of chromosome names or substrings to exclude. When loading possible sampling positions, the class iterates through `exclude_chrs` and, for each substring `s` in the list, checks whether `s in chrom`; if so, that chromosome is skipped entirely (see the small illustration after this list).
- Internal function `_retrieve` now takes an optional argument `strand` (default `None`) to enable explicit retrieval of the sequence at `chrom, position` on a specific strand. The default behavior of the `RandomPositionsSampler` class remains the same: the strand is randomly selected for each genomic position sampled.
- `PerformanceMetrics`:
- Now supports `spearmanr` and `pearsonr` from `scipy.stats`. There is room to generalize this class further in the future.
- The `update` function now accepts an optional argument `scores`, a `list(str)` naming the subset of metrics to compute.
- `TrainModel`:
- `self.step` now starts from `self._start_step`, which is nonzero when resuming from a Selene-saved checkpoint.
- Removed the call to `self._test_metrics.visualize` in `evaluate`, since the visualize method does not generalize well.
- `NonStrandSpecific`: Can now handle a model whose `forward` returns two outputs; the wrapper takes the mean or max of the forward- and reverse-strand predictions for each of the two outputs separately. A custom `NonStrandSpecific` class is recommended for more specific cases.
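
A minimal sketch of the new initialization pattern described above, assuming hypothetical file paths, key names, and lengths:

```python
from selene_sdk.samplers.dataloader import _H5Dataset, H5DataLoader

# Hypothetical HDF5 file and key names; adjust to your dataset layout.
dataset = _H5Dataset(
    "train_data.h5",
    in_memory=False,
    unpackbits_seq=True,    # sequences were stored with np.packbits
    unpackbits_tgt=False,   # targets are stored unpacked
    sequence_key="sequences",
    targets_key="targets",
    use_seq_len=1000,       # keep the center 1000 bases of each sequence
    shift=50,               # offset the retrieved window 50 bases from center
)
# The dataset is now constructed explicitly and passed in, matching the
# PyTorch Dataset / DataLoader split.
loader = H5DataLoader(dataset, num_workers=4, batch_size=64, shuffle=True)
```

And a small standalone illustration of the `exclude_chrs` substring check described for `RandomPositionsSampler` (a re-implementation of the documented logic, not the sampler's actual code path):

```python
def excluded(chrom, exclude_chrs=("_",)):
    # Mirrors the documented check: skip `chrom` if any substring matches.
    return any(s in chrom for s in exclude_chrs)

assert excluded("chr1_KI270706v1_random")            # nonstandard, contains "_"
assert not excluded("chr8")
assert excluded("chrM", exclude_chrs=("_", "chrM"))  # explicit exclusion
```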


## Version 0.5.3
- Adjust dependency requirements

3 changes: 2 additions & 1 deletion selene-cpu.yml
@@ -11,8 +11,9 @@ dependencies:
- matplotlib=2.0.2
- numpy=1.21.4
- pandas=0.20.3
- python=3.6.2
- python=3.9
- pyyaml=5.1
- ruamel.yaml=0.18.6
- scikit-learn=0.19.0
- scipy=1.1.0
- seaborn=0.8.1
11 changes: 6 additions & 5 deletions selene-gpu.yml
@@ -4,7 +4,7 @@ channels:
- bioconda
- conda-forge
dependencies:
- cython=0.29.3
- cython=0.29.24
- click==7.1.2
- docopt=0.6.2
- h5py=2.9.0
@@ -15,11 +15,12 @@ dependencies:
- statsmodels=0.9.0
- pytabix=0.0.2
- matplotlib=2.2.2
- python=3.6.5
- numpy=1.15.1
- python=3.9
- ruamel.yaml=0.18.6
- numpy=1.21.4
- plotly=2.7.0
- cudatoolkit=10.0.130=0
- pytorch=1.0.1=py3.6_cuda10.0.130_cudnn7.4.2_2
- torchvision=0.2.2=py_3
- pytorch=2.4.1=py3.9_cuda11.8_cudnn9.1.0_0
- torchvision=0.20.0=py39_cu118
- pyfaidx=0.5.5.2
- seaborn=0.8.1
5 changes: 2 additions & 3 deletions selene_sdk/cli.py
@@ -14,7 +14,7 @@
import click

from selene_sdk import __version__
from selene_sdk.utils import load_path, parse_configs_and_run
from selene_sdk.utils import load_and_parse_configs_and_run


@click.command()
@@ -23,8 +23,7 @@
@click.option('--lr', type=float, help='If training, the optimizer learning rate', show_default=True)
def main(path, lr):
"""Build the model and trains it using user-specified input data."""
configs = load_path(path, instantiate=False)
parse_configs_and_run(configs, lr=lr)
load_and_parse_configs_and_run(path, lr=lr)


if __name__ == "__main__":
132 changes: 97 additions & 35 deletions selene_sdk/samplers/dataloader.py
@@ -3,6 +3,7 @@
which allow parallel sampling for any Sampler using
torch DataLoader mechanism.
"""
import random
import sys

import h5py
@@ -125,6 +126,27 @@ def worker_init_fn(worker_id):
self.seed = seed


def unpackbits_sequence(sequence, s_len):
sequence = np.unpackbits(sequence.astype(np.uint8), axis=-2)
nulls = np.sum(sequence, axis=-1) == sequence.shape[-1]
sequence = sequence.astype(float)
sequence[nulls, :] = 1.0 / sequence.shape[-1]
if sequence.ndim == 3:
sequence = sequence[:, :s_len, :]
else:
sequence = sequence[:s_len, :]
return sequence
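# Example of what unpacking recovers (hypothetical shapes): a one-hot sequence
# of shape (s_len, 4) stored with np.packbits along the length axis unpacks to
# a padded array whose length is a multiple of 8; all-ones rows mark unknown
# bases and are reset to a uniform 1.0 / n_channels (0.25 for DNA), and the
# padding is then trimmed back to s_len.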


def unpackbits_targets(targets, t_len):
targets = np.unpackbits(targets, axis=-1).astype(float)
if targets.ndim == 2:
targets = targets[:, :t_len]
else:
targets = targets[:t_len]
return targets


class _H5Dataset(Dataset):
"""
This class provides a Dataset that directly loads sequences and targets
@@ -160,13 +182,24 @@ class _H5Dataset(Dataset):
def __init__(self,
file_path,
in_memory=False,
unpackbits=False,
unpackbits=False, # implies unpackbits for both
unpackbits_seq=False,
unpackbits_tgt=False,
sequence_key="sequences",
targets_key="targets"):
targets_key="targets",
use_seq_len=None,
shift=None):
super(_H5Dataset, self).__init__()
self.file_path = file_path
self.in_memory = in_memory

self.unpackbits = unpackbits
self.unpackbits_seq = unpackbits_seq
self.unpackbits_tgt = unpackbits_tgt

self.use_seq_len = use_seq_len
self.shift = shift
self._seq_start, self._seq_end = None, None

self._initialized = False
self._sequence_key = sequence_key
@@ -178,15 +211,22 @@ def init(func):
def dfunc(self, *args, **kwargs):
if not self._initialized:
self.db = h5py.File(self.file_path, 'r')

if self.unpackbits:
self.s_len = self.db['{0}_length'.format(self._sequence_key)][()]
self.t_len = self.db['{0}_length'.format(self._targets_key)][()]
elif self.unpackbits_seq:
self.s_len = self.db['{0}_length'.format(self._sequence_key)][()]
elif self.unpackbits_tgt:
self.t_len = self.db['{0}_length'.format(self._targets_key)][()]

if self.in_memory:
self.sequences = np.asarray(self.db[self._sequence_key])
self.targets = np.asarray(self.db[self._targets_key])
else:
self.sequences = self.db[self._sequence_key]
self.targets = self.db[self._targets_key]

self._initialized = True
return func(self, *args, **kwargs)
return dfunc
@@ -195,25 +235,33 @@ def dfunc(self, *args, **kwargs):
def __getitem__(self, index):
if isinstance(index, int):
index = index % self.sequences.shape[0]
sequence = self.sequences[index, :, :]
targets = self.targets[index, :]
sequence = self.sequences[index]
targets = self.targets[index]

if self.unpackbits:
sequence = np.unpackbits(sequence, axis=-2)
nulls = np.sum(sequence, axis=-1) == sequence.shape[-1]
sequence = sequence.astype(float)
sequence[nulls, :] = 1.0 / sequence.shape[-1]
targets = np.unpackbits(
targets, axis=-1).astype(float)
if sequence.ndim == 3:
sequence = sequence[:, :self.s_len, :]
else:
sequence = sequence[:self.s_len, :]
if targets.ndim == 2:
targets = targets[:, :self.t_len]
else:
targets = targets[:self.t_len]
return (torch.from_numpy(sequence.astype(np.float32)),
torch.from_numpy(targets.astype(np.float32)))
sequence = unpackbits_sequence(sequence, self.s_len)
targets = unpackbits_targets(targets, self.t_len)
elif self.unpackbits_seq:
sequence = unpackbits_sequence(sequence, self.s_len)
elif self.unpackbits_tgt:
targets = unpackbits_targets(targets, self.t_len)

if self._seq_start is None:
self._seq_start = 0
self._seq_end = len(sequence)

if self.use_seq_len is not None:
mid = len(sequence) // 2
self._seq_start = int(mid - np.ceil(self.use_seq_len / 2))
self._seq_end = mid + self.use_seq_len // 2
if self.shift is not None:
self._seq_start += self.shift
self._seq_end += self.shift
sequence = sequence[self._seq_start:self._seq_end]

s = sequence.astype(np.float32)
return (torch.from_numpy(s), torch.from_numpy(targets))
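# Worked example (hypothetical numbers): with len(sequence) == 1000,
# use_seq_len == 600, and shift == 50: mid = 500,
# _seq_start = 500 - 300 + 50 = 250, _seq_end = 500 + 300 + 50 = 850,
# so the returned window is sequence[250:850], i.e. 600 bases shifted
# 50 positions downstream of center.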


@init
def __len__(self):
@@ -288,20 +336,38 @@ class H5DataLoader(DataLoader):

"""
def __init__(self,
filepath,
in_memory=False,
dataset,
num_workers=1,
use_subset=None,
batch_size=1,
shuffle=True,
unpackbits=False,
sequence_key="sequences",
targets_key="targets"):
seed=436,
sampler=None,
batch_sampler=None,
shuffle=True):
g = torch.Generator()
g.manual_seed(seed)

def worker_init_fn(worker_id):
worker_seed = torch.initial_seed() % 2**32
print("Worker seed", worker_seed)
np.random.seed(worker_seed)
random.seed(worker_seed)
torch.manual_seed(worker_seed)

args = {
"batch_size": batch_size,
"num_workers": 0 if in_memory else num_workers,
"pin_memory": True
"pin_memory": True,
"worker_init_fn": worker_init_fn,
"sampler": sampler,
"batch_sampler": batch_sampler,
"generator": g,
}

if hasattr(dataset, 'in_memory'):
args['num_workers'] = 0 if dataset.in_memory else num_workers
else:
args['num_workers'] = num_workers

if use_subset is not None:
from torch.utils.data.sampler import SubsetRandomSampler
if isinstance(use_subset, int):
@@ -311,10 +377,6 @@ def __init__(self,
args["sampler"] = SubsetRandomSampler(use_subset)
else:
args["shuffle"] = shuffle
super(H5DataLoader, self).__init__(
_H5Dataset(filepath,
in_memory=in_memory,
unpackbits=unpackbits,
sequence_key=sequence_key,
targets_key=targets_key),
**args)

super(H5DataLoader, self).__init__(dataset, **args)

9 changes: 4 additions & 5 deletions selene_sdk/samplers/file_samplers/mat_file_sampler.py
@@ -240,13 +240,12 @@ def get_data_and_targets(self, batch_size, n_samples=None):
sequences_and_targets = []
targets_mat = []

count = 0
while count < n_samples:
sample_size = min(n_samples - count, batch_size)
seqs, tgts = self.sample(batch_size=sample_size)
for ix in range(0, n_samples, batch_size):
s = ix
e = min(ix+batch_size, n_samples)
seqs, tgts = self.sample(batch_size=e-s)
sequences_and_targets.append((seqs, tgts))
targets_mat.append(tgts)
count += sample_size

# TODO: should not assume targets are always integers
targets_mat = np.vstack(targets_mat).astype(float)
16 changes: 6 additions & 10 deletions selene_sdk/samplers/multi_sampler.py
@@ -221,6 +221,7 @@ def sample(self, batch_size=1, mode=None):
except StopIteration:
#If DataLoader iterator reaches its length, reinitialize
self._iterators[mode] = iter(self._dataloaders[mode])

data, targets = next(self._iterators[mode])
return data.numpy(), targets.numpy()

@@ -260,16 +261,11 @@ def get_data_and_targets(self, batch_size, n_samples=None, mode=None):
self._set_batch_size(batch_size, mode=mode)
data_and_targets = []
targets_mat = []
count = batch_size
while count < n_samples:
data, tgts = self.sample(batch_size=batch_size, mode=mode)
data_and_targets.append((data, tgts))
targets_mat.append(tgts)
count += batch_size
remainder = batch_size - (count - n_samples)
data, tgts = self.sample(batch_size=remainder)
data_and_targets.append((data, tgts))
targets_mat.append(tgts)
for s in range(0, n_samples, batch_size):
e = min(n_samples, s+batch_size)
output = self.sample(batch_size=e-s, mode=mode)
data_and_targets.append(output)
targets_mat.append(output[1])
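# Hypothetical example: n_samples=1000 with batch_size=300 yields sample
# sizes 300, 300, 300, 100, covering n_samples exactly (the final partial
# batch is included rather than dropped).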
targets_mat = np.vstack(targets_mat)
return data_and_targets, targets_mat

22 changes: 15 additions & 7 deletions selene_sdk/samplers/online_sampler.py
@@ -25,9 +25,13 @@ class OnlineSampler(Sampler, metaclass=ABCMeta):
----------
reference_sequence : selene_sdk.sequences.Sequence
A reference sequence from which to create examples.
target_path : str
target_path : str or selene_sdk.targets.Target
Path to tabix-indexed, compressed BED file (`*.bed.gz`) of genomic
coordinates mapped to the genomic features we want to predict.
`target_path` will be loaded as a `GenomicFeatures` object.
Currently, `target_path` is also overloaded to accept a
`selene_sdk.targets.Target` object directly, either `GenomicFeatures`
or `GenomicFeaturesH5`.
features : list(str)
List of distinct features that we aim to predict.
seed : int, optional
@@ -213,9 +217,12 @@ def __init__(self,

self.reference_sequence = reference_sequence
self.n_features = len(self._features)
self.target = GenomicFeatures(
target_path, self._features,
feature_thresholds=feature_thresholds)
if isinstance(target_path, str):
self.target = GenomicFeatures(
target_path, self._features,
feature_thresholds=feature_thresholds)
else:
self.target = target_path
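# Hypothetical usage: a constructed Target object, e.g. GenomicFeaturesH5(...),
# may now be passed as `target_path` in place of a BED file path.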
self._save_filehandles = {}

def get_feature_from_index(self, index):
@@ -336,9 +343,10 @@ def get_data_and_targets(self, batch_size, n_samples=None, mode=None):
elif n_samples is None and mode == "test":
n_samples = 640000

n_batches = int(n_samples / batch_size)
for _ in range(n_batches):
inputs, targets = self.sample(batch_size)
for ix in range(0, n_samples, batch_size):
s = ix
e = min(ix+batch_size, n_samples)
inputs, targets = self.sample(e-s)
sequences_and_targets.append((inputs, targets))
targets_mat = np.vstack([t for (s, t) in sequences_and_targets])
if mode in self._save_datasets: