From 79773021b8b43841b46f46d250b212e48fde6c0d Mon Sep 17 00:00:00 2001
From: gwirn <71886945+gwirn@users.noreply.github.com>
Date: Wed, 5 Jun 2024 15:08:24 +0200
Subject: [PATCH] updated examples for new PDBData class

---
 examples/README.md                         |   1 +
 examples/analysis_example.py               |   9 +-
 examples/bb_example_subclassing_trainer.py | 168 ++++++++++++---------
 examples/bb_foldingnet_basic.py            |  48 +++---
 examples/bb_many_subclassing_examples.py   |  20 +--
 examples/prepare_example.py                |   2 +-
 src/molearn/data/prepare.py                |   6 +-
 7 files changed, 148 insertions(+), 106 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 68ea172..a6ade67 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -10,6 +10,7 @@ https://journals.aps.org/prx/abstract/10.1103/PhysRevX.11.011052).
 #### Training data
 
 The files `MurD_closed.tar.gz` and `MurD_open.tar.gz` contain 900 conformations each of MurD, generated with MD simulations of its closed and open state. Extracting these files will yield `MurD_closed_selection.pdb` and `MurD_open_selection.pdb`.
+To use them as training data, run `prepare_example.py` to obtain a joined and prepared trajectory.
 
 #### Test data
 
diff --git a/examples/analysis_example.py b/examples/analysis_example.py
index cb3114a..fa2568c 100644
--- a/examples/analysis_example.py
+++ b/examples/analysis_example.py
@@ -1,5 +1,8 @@
 import torch
 import os
+# import sys
+
+# sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
 from molearn.models.foldingnet import AutoEncoder
 from molearn.analysis import MolearnAnalysis
 from molearn.data import PDBData
@@ -40,8 +43,10 @@ def main():
     # by defining the manual seed and loading the dataset in the same order as when
     # the neural network was trained, the same train-test split will be obtained
     data = PDBData()
-    data.import_pdb(f"data{os.sep}MurD_closed_selection.pdb")
-    data.import_pdb(f"data{os.sep}MurD_open_selection.pdb")
+    data.import_pdb(
+        "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
+        "./clustered/MurD_open_selection_NEW_TOPO.pdb",
+    )
     data.fix_terminal()
     data.atomselect(atoms=["CA", "C", "N", "CB", "O"])
     data.prepare_dataset()
diff --git a/examples/bb_example_subclassing_trainer.py b/examples/bb_example_subclassing_trainer.py
index 0555359..56b3887 100644
--- a/examples/bb_example_subclassing_trainer.py
+++ b/examples/bb_example_subclassing_trainer.py
@@ -1,5 +1,7 @@
-import sys, os
-sys.path.insert(0, os.path.join(os.path.abspath(os.pardir),'src'))
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
 from molearn.data import PDBData
 from molearn.trainers import OpenMM_Physics_Trainer
 from molearn.models.foldingnet import AutoEncoder
@@ -10,122 +12,144 @@ class CustomTrainer(OpenMM_Physics_Trainer):
-#### All commented out sections are not needed, they are there for demonstration purposes
+    #### All commented-out sections are not needed; they are there for demonstration purposes
 ### This is what common_step looks like in Trainer ###
-# def common_step(self, batch):
-#     self._internal = {}
-#     encoded = self.autoencoder.encode(batch)
-#     self._internal['encoded'] = encoded
-#     decoded = self.autoencoder.decode(encoded)[:,:,:batch.size(2)]
-#     self._internal['decoded'] = decoded
-#     return dict(mse_loss = ((batch-decoded)**2).mean())
+    # def common_step(self, batch):
+    #     self._internal = {}
+    #     encoded = self.autoencoder.encode(batch)
+    #     self._internal['encoded'] = encoded
+    #     decoded = self.autoencoder.decode(encoded)[:,:,:batch.size(2)]
+    #     self._internal['decoded'] = decoded
+    #     return dict(mse_loss = ((batch-decoded)**2).mean())
 
 ### This is what common_physics_step looks like in OpenMM_Physics_Trainer ###
-# def common_physics_step(self, batch, latent):
-#     alpha = torch.rand(int(len(batch)//2), 1, 1).type_as(latent)
-#     latent_interpolated = (1-alpha)*latent[:-1:2] + alpha*latent[1::2]
-#     generated = self.autoencoder.decode(latent_interpolated)[:,:,:batch.size(2)]
-#     self._internal['generated'] = generated
-#     energy = self.physics_loss(generated)
-#     energy[energy.isinf()]=1e35
-#     energy = torch.clamp(energy, max=1e34)
-#     energy = energy.nanmean()
-#     return {'physics_loss':energy}#a if not energy.isinf() else torch.tensor(0.0)}
+    # def common_physics_step(self, batch, latent):
+    #     alpha = torch.rand(int(len(batch)//2), 1, 1).type_as(latent)
+    #     latent_interpolated = (1-alpha)*latent[:-1:2] + alpha*latent[1::2]
+    #     generated = self.autoencoder.decode(latent_interpolated)[:,:,:batch.size(2)]
+    #     self._internal['generated'] = generated
+    #     energy = self.physics_loss(generated)
+    #     energy[energy.isinf()]=1e35
+    #     energy = torch.clamp(energy, max=1e34)
+    #     energy = energy.nanmean()
+    #     return {'physics_loss':energy}#a if not energy.isinf() else torch.tensor(0.0)}
 
 ### This is what valid_step looks like in OpenMM_Physics_Trainer ###
-# def valid_step(self, batch):
-#     results = self.common_step(batch)
-#     results.update(self.common_physics_step(batch, self._internal['encoded']))
-#     scale = (self.psf*results['mse_loss'])/(results['physics_loss'] +1e-5)
-#     final_loss = torch.log(results['mse_loss'])+scale*torch.log(results['physics_loss']
-#     results['loss'] = final_loss
-#     return results
+    # def valid_step(self, batch):
+    #     results = self.common_step(batch)
+    #     results.update(self.common_physics_step(batch, self._internal['encoded']))
+    #     scale = (self.psf*results['mse_loss'])/(results['physics_loss'] +1e-5)
+    #     final_loss = torch.log(results['mse_loss'])+scale*torch.log(results['physics_loss'])
+    #     results['loss'] = final_loss
+    #     return results
 
     def valid_step(self, batch):
-        results = super().valid_step(batch)
-        #rmsd
-        rmsd = (((batch-self._internal['decoded'])*self.std)**2).sum(dim=1).mean().sqrt()
-        results['RMSD'] = rmsd # 'valid_' will automatically be prepended onto this in valid_epoch, to distinguish it from train_step
+        results = super().valid_step(batch)
+        # rmsd
+        rmsd = (
+            (((batch - self._internal["decoded"]) * self.std) ** 2)
+            .sum(dim=1)
+            .mean()
+            .sqrt()
+        )
+        results["RMSD"] = (
+            rmsd  # 'valid_' will automatically be prepended onto this in valid_epoch, to distinguish it from train_step
+        )
 
-        #calculate some dope
+        # calculate some dope
         if self.first_valid_step:
             self.first_valid_step = False
-            if not hasattr(self, 'dope_score_class'):
-                self.dope_score_class = Parallel_DOPE_Score(self.mol,processes=torch.get_num_threads())
-            #Calculated dope of decoded structures
+            if not hasattr(self, "dope_score_class"):
+                self.dope_score_class = Parallel_DOPE_Score(
+                    self.mol, processes=torch.get_num_threads()
+                )
+            # Calculate dope of decoded structures
             self.dope_scores = []
-            decoded_batch = (self._internal['decoded'].permute(0,2,1)*self.std).data.cpu().numpy()
+            decoded_batch = (
+                (self._internal["decoded"].permute(0, 2, 1) * self.std)
+                .data.cpu()
+                .numpy()
+            )
             for f in decoded_batch:
                 if np.isfinite(f).all():
-                    self.dope_scores.append(self.dope_score_class.get_score(f,refine='both'))
+                    self.dope_scores.append(
+                        self.dope_score_class.get_score(f, refine="both")
+                    )
 
-            #Calcutate dope of interpolated/generated structures
+            # Calculate dope of interpolated/generated structures
             self.interp_dope_scores = []
-            interpolated_batch = (self._internal['generated'].permute(0,2,1)*self.std).data.cpu().numpy()
+            interpolated_batch = (
+                (self._internal["generated"].permute(0, 2, 1) * self.std)
+                .data.cpu()
+                .numpy()
+            )
             for f in interpolated_batch:
                 if np.isfinite(f).all():
-                    self.interp_dope_scores.append(self.dope_score_class.get_score(f,refine='both'))
+                    self.interp_dope_scores.append(
+                        self.dope_score_class.get_score(f, refine="both")
+                    )
         # These will calculate in the background, synchronize at the end of the epoch.
         return results
 
     def valid_epoch(self, *args, **kwargs):
-        self.first_valid_step = self.epoch%5==0
+        self.first_valid_step = self.epoch % 5 == 0
         results = super().valid_epoch(*args, **kwargs)
         # Might as well keep track of cuda memory once an epoch
-        memory = torch.cuda.max_memory_allocated()/1000000.0
-        results['Memory'] = memory
+        memory = torch.cuda.max_memory_allocated() / 1000000.0
+        results["Memory"] = memory
 
-        if self.epoch%5==0:
+        if self.epoch % 5 == 0:
             t1 = time()
-            #self.dope_scores contains multiprocessing result objects, get the results
-            #This will synchronize the code
+            # self.dope_scores contains multiprocessing result objects, get the results
+            # This will synchronize the code
             dope = np.array([r.get() for r in self.dope_scores])
             idope = np.array([r.get() for r in self.interp_dope_scores])
-            #Dope score returns (DOPE_score, refined_DOPE_score), might as well log both
-            results['valid_DOPE'] = dope[:,0].mean()
-            results['valid_DOPE_refined'] = dope[:,1].mean()
-            results['valid_DOPE_interp'] = idope[:,0].mean()
-            results['valid_DOPE_interp_refined'] = idope[:,1].mean()
-            results['valid_DOPE_time'] = time()-t1 # extra time taken to calculate DOPE
+            # Dope score returns (DOPE_score, refined_DOPE_score), might as well log both
+            results["valid_DOPE"] = dope[:, 0].mean()
+            results["valid_DOPE_refined"] = dope[:, 1].mean()
+            results["valid_DOPE_interp"] = idope[:, 0].mean()
+            results["valid_DOPE_interp_refined"] = idope[:, 1].mean()
+            results["valid_DOPE_time"] = (
+                time() - t1
+            )  # extra time taken to calculate DOPE
         return results
 
-
-if __name__ == '__main__':
-
+if __name__ == "__main__":
     ##### Load Data #####
     data = PDBData()
-    data.import_pdb('data/MurD_closed_selection.pdb')
-    data.import_pdb('data/MurD_open_selection.pdb')
+    data.import_pdb(
+        "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
+        "./clustered/MurD_open_selection_NEW_TOPO.pdb",
+    )
     data.fix_terminal()
-    data.atomselect(atoms = ['CA', 'C', 'N', 'CB', 'O'])
+    data.atomselect(atoms=["CA", "C", "N", "CB", "O"])
 
     ##### Prepare Trainer #####
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     trainer = CustomTrainer(device=device)
-    trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed = 25)
-    trainer.prepare_physics(remove_NB = True)
-
-    trainer.set_autoencoder(AutoEncoder, out_points = data.dataset.shape[-1])
-    trainer.prepare_optimiser()
+    trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed=25)
+    trainer.prepare_physics(remove_NB=True)
+    trainer.set_autoencoder(AutoEncoder, out_points=data.dataset.shape[-1])
+    trainer.prepare_optimiser()
 
     ##### Training Loop #####
-    #Keep training until loss does not improve for 32 consecutive epochs
+    # Keep training until loss does not improve for 32 consecutive epochs
     runkwargs = dict(
-        log_filename='log_file.dat',
-        log_folder='xbb_foldingnet_checkpoints',
-        checkpoint_folder='xbb_foldingnet_checkpoints',
-        )
+        log_filename="log_file.dat",
+        log_folder="xbb_foldingnet_checkpoints",
+        checkpoint_folder="xbb_foldingnet_checkpoints",
+    )
     best = 1e24
     while True:
-        trainer.run(max_epochs = 32+trainer.epoch,**runkwargs)
-        if not best>trainer.best:
+        trainer.run(max_epochs=32 + trainer.epoch, **runkwargs)
+        if not best > trainer.best:
             break
         best = trainer.best
-    print(f'best {trainer.best}, best_filename {trainer.best_name}')
+    print(f"best {trainer.best}, best_filename {trainer.best_name}")
diff --git a/examples/bb_foldingnet_basic.py b/examples/bb_foldingnet_basic.py
index 8f620fe..5dea4f5 100644
--- a/examples/bb_foldingnet_basic.py
+++ b/examples/bb_foldingnet_basic.py
@@ -1,44 +1,50 @@
-import sys, os
-sys.path.insert(0, os.path.join(os.path.abspath(os.pardir),'src'))
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
 from molearn.data import PDBData
 from molearn.trainers import OpenMM_Physics_Trainer
 from molearn.models.foldingnet import AutoEncoder
 import torch
 
-if __name__ == '__main__':
-
+def main():
     ##### Load Data #####
     data = PDBData()
-    data.import_pdb('data/MurD_closed_selection.pdb')
-    data.import_pdb('data/MurD_open_selection.pdb')
+    data.import_pdb(
+        "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
+        "./clustered/MurD_open_selection_NEW_TOPO.pdb",
+    )
     data.fix_terminal()
-    data.atomselect(atoms = ['CA', 'C', 'N', 'CB', 'O'])
+    data.atomselect(atoms=["CA", "C", "N", "CB", "O"])
 
     ##### Prepare Trainer #####
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     trainer = OpenMM_Physics_Trainer(device=device)
-    trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed = 25)
-    trainer.prepare_physics(remove_NB = True)
-
-    trainer.set_autoencoder(AutoEncoder, out_points = data.dataset.shape[-1])
-    trainer.prepare_optimiser()
+    trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed=25)
+    trainer.prepare_physics(remove_NB=True)
+    trainer.set_autoencoder(AutoEncoder, out_points=data.dataset.shape[-1])
+    trainer.prepare_optimiser()
 
     ##### Training Loop #####
-    #Keep training until loss does not improve for 32 consecutive epochs
+    # Keep training until loss does not improve for 32 consecutive epochs
     runkwargs = dict(
-        log_filename='log_file.dat',
-        log_folder='xbb_foldingnet_checkpoints',
-        checkpoint_folder='xbb_foldingnet_checkpoints',
-        )
+        log_filename="log_file.dat",
+        log_folder="xbb_foldingnet_checkpoints",
+        checkpoint_folder="xbb_foldingnet_checkpoints",
+    )
     best = 1e24
     while True:
-        trainer.run(max_epochs = 32+trainer.epoch,**runkwargs)
-        if not best>trainer.best:
+        trainer.run(max_epochs=32 + trainer.epoch, **runkwargs)
+        if not best > trainer.best:
             break
         best = trainer.best
-    print(f'best {trainer.best}, best_filename {trainer.best_name}')
+    print(f"best {trainer.best}, best_filename {trainer.best_name}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/bb_many_subclassing_examples.py b/examples/bb_many_subclassing_examples.py
index 74cc2a0..05b1897 100644
--- a/examples/bb_many_subclassing_examples.py
+++ b/examples/bb_many_subclassing_examples.py
@@ -15,14 +15,14 @@ class ValidRMSDTrainer(OpenMM_Physics_Trainer):
     '''
     def valid_step(self, batch):
         results = super().valid_step(batch)
-        #rmsd 
+        #rmsd
         rmsd = (((batch-self._internal['decoded'])*self.std)**2).sum(dim=1).mean().sqrt()
         results['RMSD'] = rmsd # 'valid_' will automatically be prepended onto this in trainer.valid_epoch, to distinguish it from train_step
         return results
 
 class ValidDOPETrainer(OpenMM_Physics_Trainer):
     '''
-    calculate DOPE, this might slow your code down significantly so keep an eye on the 'valid_DOPE_extra_time' 
+    calculate DOPE; this might slow your code down significantly, so keep an eye on the 'valid_DOPE_extra_time'
     '''
     def __init__(self,calculate_dope_every_n = 4, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -30,7 +30,7 @@ def __init__(self,calculate_dope_every_n = 4, *args, **kwargs):
 
     def valid_epoch(self, *args, **kwargs):
         '''
-        Override valid_epoch, (make sure to call super().valid_epoch(*args, **kwargs) to keep default behavior too). 
+        Override valid_epoch (make sure to call super().valid_epoch(*args, **kwargs) to keep default behavior too).
         '''
         # DOPE Calculations are slow, so you probably won't want to calculate them every epoch. Instead every 4 to 16 epochs.
         self.calculate_dope = self.epoch%self.calculate_dope_every==0
@@ -50,7 +50,7 @@ def valid_epoch(self, *args, **kwargs):
 
     def valid_step(self, batch):
         '''
-        Keep default behavior by calling super().valid_step(batch), submit jobs to a multiprocessing pool with Parallel_DOPE_Score, then get results back at the end of the epoch in trainer.valid_epoch 
+        Keep default behavior by calling super().valid_step(batch), submit jobs to a multiprocessing pool with Parallel_DOPE_Score, then get results back at the end of the epoch in trainer.valid_epoch
         '''
         results = super().valid_step(batch)
         if not hasattr(self, 'dope_score_class'):
@@ -79,7 +79,7 @@ def valid_epoch(self, *args, **kwargs):
         torch.cuda.reset_max_memory_allocated()
         results['Memory'] = memory
         return results
-    
+
 class DisablePhysicsTrainer(OpenMM_Physics_Trainer):
     '''
     Disable Physics in train_step. This essentially resets the train_step to that of Molearn.trainers.Trainer.
@@ -100,7 +100,7 @@ def train_step(self, batch):
 
 class CalculateDecodedEnergyTrainer(OpenMM_Physics_Trainer):
     '''
-    OpenMM_Physics_Trainer calculates the energy off interpolated structures. It might be helpful to track the energy of decoded structures too. 
+    OpenMM_Physics_Trainer calculates the energy of interpolated structures. It might be helpful to track the energy of decoded structures too.
     '''
     def common_physics_step(self, batch, latent):
         # calculate interpolated structures energy as normal
@@ -124,8 +124,10 @@ def common_physics_step(self, batch, latent):
 
     ##### Load Data #####
     data = PDBData()
-    data.import_pdb('data/MurD_closed_selection.pdb')
-    data.import_pdb('data/MurD_open_selection.pdb')
+    data.import_pdb(
+        "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",
+        "./clustered/MurD_open_selection_NEW_TOPO.pdb",
+    )
     data.fix_terminal()
     data.atomselect(atoms = ['CA', 'C', 'N', 'CB', 'O'])
 
@@ -135,7 +137,7 @@ def common_physics_step(self, batch, latent):
     trainer.set_data(data, batch_size=8, validation_split=0.1, manual_seed = 25)
     trainer.prepare_physics(remove_NB = True)
-    
+
     trainer.set_autoencoder(AutoEncoder, out_points = data.dataset.shape[-1])
     trainer.prepare_optimiser()
 
diff --git a/examples/prepare_example.py b/examples/prepare_example.py
index 266afd7..6e4dfca 100644
--- a/examples/prepare_example.py
+++ b/examples/prepare_example.py
@@ -21,7 +21,7 @@ def main():
             "./data/preparation/topo_MurDclosed1F.pdb",
         ],
         test_size=0.0,
-        n_cluster=5,
+        n_cluster=1500,
         outpath=storage_path,
         verbose=True,
     )
diff --git a/src/molearn/data/prepare.py b/src/molearn/data/prepare.py
index b5ea9b9..8a43903 100644
--- a/src/molearn/data/prepare.py
+++ b/src/molearn/data/prepare.py
@@ -142,7 +142,9 @@ def read_traj(self) -> None:
             ), "there must be as many topologies supplied as trajectories"
         multi_traj = []
         top0 = None
+        ucell0 = None
         for ct, (trp, top) in enumerate(zip(self.traj_path, self.topo_path)):
+            print(f"\tLoading Trajectory {ct}")
             loaded = None
             try:
                 # do not enforce topology file on these formats
@@ -155,15 +157,17 @@ def read_traj(self) -> None:
                 loaded = self._loading_fallback(trp, top)
             # select only protein atoms from the trajectory
             loaded = loaded.atom_slice(loaded.top.select("protein"))
-            # to be able to join them
+            # use topology and unit cell from first trajectory to be able to join them
             if ct == 0:
                 top0 = loaded.topology
+                ucell0 = loaded.unitcell_vectors
             else:
                 if top0.n_atoms != loaded.n_atoms:
                     raise ValueError(
                         f"topologies do not match - topology [{ct}] has {loaded.n_atoms} instead of {top0.n_atoms}"
                     )
                 loaded.topology = top0
+                loaded.unitcell_vectors = ucell0
             multi_traj.append(loaded)
         self.traj = md.join(multi_traj)
         # converts ELEMENT names from e.g. "Cd" -> "C" to avoid later complications
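
Below is a minimal sketch of the two-step workflow these updated examples assume: first join, cluster, and prepare the raw MurD trajectories with `examples/prepare_example.py`, then load the clustered trajectory plus its topology through the new two-argument `PDBData.import_pdb`. It only reuses calls that appear in the diff above; the `./clustered/` file names are assumptions carried over from the examples and depend on how `DataPreparation` is configured.

```python
# Minimal sketch, not part of the patch. Step 1: run examples/prepare_example.py,
# which is assumed to write a clustered training trajectory and a matching
# topology into ./clustered/ (file names below are assumed outputs).
# Step 2: load them with the new PDBData API (trajectory file + topology file).
from molearn.data import PDBData

data = PDBData()
data.import_pdb(
    "./clustered/MurD_open_selection_CLUSTER_aggl_train.dcd",  # clustered trajectory
    "./clustered/MurD_open_selection_NEW_TOPO.pdb",            # matching topology
)
data.fix_terminal()  # fix terminal residue names before atom selection
data.atomselect(atoms=["CA", "C", "N", "CB", "O"])  # backbone plus CB
data.prepare_dataset()  # called explicitly in analysis_example.py
```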